diff options
Diffstat (limited to 'intern')
481 files changed, 33197 insertions, 42361 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 381248e9bf1..17096d441f0 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -247,7 +247,7 @@ if(WITH_CYCLES_OSL) endif() if(WITH_CYCLES_DEVICE_OPTIX) - find_package(OptiX) + find_package(OptiX 7.3.0) if(OPTIX_FOUND) add_definitions(-DWITH_OPTIX) @@ -286,11 +286,17 @@ if(WITH_OPENSUBDIV) ) endif() +if(WITH_OPENIMAGEDENOISE) + add_definitions(-DWITH_OPENIMAGEDENOISE) + add_definitions(-DOIDN_STATIC_LIB) + include_directories( + SYSTEM + ${OPENIMAGEDENOISE_INCLUDE_DIRS} + ) +endif() + if(WITH_CYCLES_STANDALONE) - set(WITH_CYCLES_DEVICE_OPENCL TRUE) set(WITH_CYCLES_DEVICE_CUDA TRUE) - # Experimental and unfinished. - set(WITH_CYCLES_NETWORK FALSE) endif() # TODO(sergey): Consider removing it, only causes confusion in interface. set(WITH_CYCLES_DEVICE_MULTI TRUE) @@ -386,18 +392,12 @@ if(WITH_CYCLES_BLENDER) add_subdirectory(blender) endif() -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() - -if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER) - add_subdirectory(app) -endif() - +add_subdirectory(app) add_subdirectory(bvh) add_subdirectory(device) add_subdirectory(doc) add_subdirectory(graph) +add_subdirectory(integrator) add_subdirectory(kernel) add_subdirectory(render) add_subdirectory(subd) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 7a1e5d62dd2..f9dc5f00802 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -91,24 +91,6 @@ if(WITH_CYCLES_STANDALONE) endif() ##################################################################### -# Cycles network server executable -##################################################################### - -if(WITH_CYCLES_NETWORK) - set(SRC - cycles_server.cpp - ) - add_executable(cycles_server ${SRC}) - target_link_libraries(cycles_server ${LIBRARIES}) - cycles_target_link_libraries(cycles_server) - - if(UNIX 
AND NOT APPLE) - set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib) - endif() - unset(SRC) -endif() - -##################################################################### # Cycles cubin compiler executable ##################################################################### diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 6b3513b065a..270096d70b0 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -126,7 +126,7 @@ static BufferParams &session_buffer_params() static void scene_init() { - options.scene = new Scene(options.scene_params, options.session->device); + options.scene = options.session->scene; /* Read XML */ xml_read_file(options.scene, options.filepath.c_str()); @@ -148,7 +148,7 @@ static void scene_init() static void session_init() { options.session_params.write_render_cb = write_render; - options.session = new Session(options.session_params); + options.session = new Session(options.session_params, options.scene_params); if (options.session_params.background && !options.quiet) options.session->progress.set_update_callback(function_bind(&session_print_status)); @@ -159,7 +159,6 @@ static void session_init() /* load scene */ scene_init(); - options.session->scene = options.scene; options.session->reset(session_buffer_params(), options.session_params.samples); options.session->start(); @@ -527,9 +526,6 @@ static void options_parse(int argc, const char **argv) fprintf(stderr, "No file path specified\n"); exit(EXIT_FAILURE); } - - /* For smoother Viewport */ - options.session_params.start_resolution = 64; } CCL_NAMESPACE_END diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index 276d850f1b3..54f97fddbd9 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -703,7 +703,7 @@ void xml_read_file(Scene *scene, const char *filepath) xml_read_include(state, 
path_filename(filepath)); - scene->params.bvh_type = SceneParams::BVH_STATIC; + scene->params.bvh_type = BVH_TYPE_STATIC; } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index ee5c6157338..5bdcfd56a4d 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -33,6 +33,7 @@ set(SRC blender_device.cpp blender_image.cpp blender_geometry.cpp + blender_gpu_display.cpp blender_light.cpp blender_mesh.cpp blender_object.cpp @@ -50,6 +51,7 @@ set(SRC CCL_api.h blender_device.h + blender_gpu_display.h blender_id_map.h blender_image.h blender_object_cull.h @@ -93,14 +95,6 @@ set(ADDON_FILES add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_DEVICE_OPENCL) - add_definitions(-DWITH_OPENCL) -endif() - -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() - if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index f728050a3cf..1ce25a253f9 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -58,7 +58,6 @@ class CyclesRender(bpy.types.RenderEngine): bl_use_eevee_viewport = True bl_use_preview = True bl_use_exclude_layers = True - bl_use_save_buffers = True bl_use_spherical_stereo = True bl_use_custom_freestyle = True bl_use_alembic_procedural = True @@ -85,6 +84,12 @@ class CyclesRender(bpy.types.RenderEngine): def render(self, depsgraph): engine.render(self, depsgraph) + def render_frame_finish(self): + engine.render_frame_finish(self) + + def draw(self, context, depsgraph): + engine.draw(self, depsgraph, context.space_data) + def bake(self, depsgraph, obj, pass_type, pass_filter, width, height): engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height) @@ -98,7 +103,7 @@ class CyclesRender(bpy.types.RenderEngine): engine.sync(self, depsgraph, context.blend_data) def view_draw(self, context, depsgraph): - 
engine.draw(self, depsgraph, context.region, context.space_data, context.region_data) + engine.view_draw(self, depsgraph, context.region, context.space_data, context.region_data) def update_script_node(self, node): if engine.with_osl(): diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 489a883f098..e0e8ca10bef 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -18,62 +18,17 @@ from __future__ import annotations -def _is_using_buggy_driver(): - import gpu - # We need to be conservative here because in multi-GPU systems display card - # might be quite old, but others one might be just good. - # - # So We shouldn't disable possible good dedicated cards just because display - # card seems weak. And instead we only blacklist configurations which are - # proven to cause problems. - if gpu.platform.vendor_get() == "ATI Technologies Inc.": - import re - version = gpu.platform.version_get() - if version.endswith("Compatibility Profile Context"): - # Old HD 4xxx and 5xxx series drivers did not have driver version - # in the version string, but those cards do not quite work and - # causing crashes. 
- return True - regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$") - if not regex.match(version): - # Skip cards like FireGL - return False - version = regex.sub("\\1", version).split('.') - return int(version[0]) == 8 - return False - - -def _workaround_buggy_drivers(): - if _is_using_buggy_driver(): - import _cycles - if hasattr(_cycles, "opencl_disable"): - print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.") - _cycles.opencl_disable() - - def _configure_argument_parser(): import argparse # No help because it conflicts with general Python scripts argument parsing parser = argparse.ArgumentParser(description="Cycles Addon argument parser", add_help=False) - parser.add_argument("--cycles-resumable-num-chunks", - help="Number of chunks to split sample range into", - default=None) - parser.add_argument("--cycles-resumable-current-chunk", - help="Current chunk of samples range to render", - default=None) - parser.add_argument("--cycles-resumable-start-chunk", - help="Start chunk to render", - default=None) - parser.add_argument("--cycles-resumable-end-chunk", - help="End chunk to render", - default=None) parser.add_argument("--cycles-print-stats", help="Print rendering statistics to stderr", action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'OPENCL'." + "Valid options are 'CPU', 'CUDA' or 'OPTIX'." 
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser @@ -89,21 +44,6 @@ def _parse_command_line(): parser = _configure_argument_parser() args, _ = parser.parse_known_args(argv[argv.index("--") + 1:]) - if args.cycles_resumable_num_chunks is not None: - if args.cycles_resumable_current_chunk is not None: - import _cycles - _cycles.set_resumable_chunk( - int(args.cycles_resumable_num_chunks), - int(args.cycles_resumable_current_chunk), - ) - elif args.cycles_resumable_start_chunk is not None and \ - args.cycles_resumable_end_chunk: - import _cycles - _cycles.set_resumable_chunk_range( - int(args.cycles_resumable_num_chunks), - int(args.cycles_resumable_start_chunk), - int(args.cycles_resumable_end_chunk), - ) if args.cycles_print_stats: import _cycles _cycles.enable_print_stats() @@ -118,23 +58,11 @@ def init(): import _cycles import os.path - # Workaround possibly buggy legacy drivers which crashes on the OpenCL - # device enumeration. - # - # This checks are not really correct because they might still fail - # in the case of multiple GPUs. However, currently buggy drivers - # are really old and likely to be used in single GPU systems only - # anyway. - # - # Can't do it in the background mode, so we hope OpenCL is no enabled - # in the user preferences. 
- if not bpy.app.background: - _workaround_buggy_drivers() - path = os.path.dirname(__file__) user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path=''))) + temp_path = bpy.app.tempdir - _cycles.init(path, user_path, bpy.app.background) + _cycles.init(path, user_path, temp_path, bpy.app.background) _parse_command_line() @@ -177,6 +105,25 @@ def render(engine, depsgraph): _cycles.render(engine.session, depsgraph.as_pointer()) +def render_frame_finish(engine): + if not engine.session: + return + + import _cycles + _cycles.render_frame_finish(engine.session) + +def draw(engine, depsgraph, space_image): + if not engine.session: + return + + depsgraph_ptr = depsgraph.as_pointer() + space_image_ptr = space_image.as_pointer() + screen_ptr = space_image.id_data.as_pointer() + + import _cycles + _cycles.draw(engine.session, depsgraph_ptr, screen_ptr, space_image_ptr) + + def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height): import _cycles session = getattr(engine, "session", None) @@ -204,14 +151,14 @@ def sync(engine, depsgraph, data): _cycles.sync(engine.session, depsgraph.as_pointer()) -def draw(engine, depsgraph, region, v3d, rv3d): +def view_draw(engine, depsgraph, region, v3d, rv3d): import _cycles depsgraph = depsgraph.as_pointer() v3d = v3d.as_pointer() rv3d = rv3d.as_pointer() # draw render image - _cycles.draw(engine.session, depsgraph, v3d, rv3d) + _cycles.view_draw(engine.session, depsgraph, v3d, rv3d) def available_devices(): @@ -224,11 +171,6 @@ def with_osl(): return _cycles.with_osl -def with_network(): - import _cycles - return _cycles.with_network - - def system_info(): import _cycles return _cycles.system_info() @@ -243,6 +185,7 @@ def list_render_passes(scene, srl): # Data passes. 
if srl.use_pass_z: yield ("Depth", "Z", 'VALUE') if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE') + if srl.use_pass_position: yield ("Position", "XYZ", 'VECTOR') if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR') if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR') if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR') @@ -265,6 +208,7 @@ def list_render_passes(scene, srl): if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR') if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR') if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR') + if crl.use_pass_shadow_catcher: yield ("Shadow Catcher", "RGB", 'COLOR') # Debug passes. if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE') @@ -283,30 +227,20 @@ def list_render_passes(scene, srl): yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR') # Denoising passes. - if (scene.cycles.use_denoising and crl.use_denoising) or crl.denoising_store_passes: + if scene.cycles.use_denoising and crl.use_denoising: yield ("Noisy Image", "RGBA", 'COLOR') - if crl.denoising_store_passes: - yield ("Denoising Normal", "XYZ", 'VECTOR') - yield ("Denoising Albedo", "RGB", 'COLOR') - yield ("Denoising Depth", "Z", 'VALUE') - - if scene.cycles.denoiser == 'NLM': - yield ("Denoising Shadowing", "X", 'VALUE') - yield ("Denoising Variance", "RGB", 'COLOR') - yield ("Denoising Intensity", "X", 'VALUE') - - clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", - "denoising_glossy_direct", "denoising_glossy_indirect", - "denoising_transmission_direct", "denoising_transmission_indirect") - if any(getattr(crl, option) for option in clean_options): - yield ("Denoising Clean", "RGB", 'COLOR') + if crl.use_pass_shadow_catcher: + yield ("Noisy Shadow Catcher", "RGBA", 'COLOR') + if crl.denoising_store_passes: + yield ("Denoising Normal", "XYZ", 'VECTOR') + yield ("Denoising Albedo", "RGB", 'COLOR') # Custom AOV passes. 
for aov in srl.aovs: if aov.type == 'VALUE': yield (aov.name, "X", 'VALUE') else: - yield (aov.name, "RGBA", 'COLOR') + yield (aov.name, "RGB", 'COLOR') def register_passes(engine, scene, view_layer): diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index bf33e5dc010..37c39904e30 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -60,32 +60,48 @@ class AddPresetSampling(AddPresetBase, Operator): ] preset_values = [ + "cycles.use_adaptive_sampling", "cycles.samples", - "cycles.preview_samples", - "cycles.aa_samples", - "cycles.preview_aa_samples", - "cycles.diffuse_samples", - "cycles.glossy_samples", - "cycles.transmission_samples", - "cycles.ao_samples", - "cycles.mesh_light_samples", - "cycles.subsurface_samples", - "cycles.volume_samples", - "cycles.use_square_samples", - "cycles.progressive", - "cycles.seed", - "cycles.sample_clamp_direct", - "cycles.sample_clamp_indirect", - "cycles.sample_all_lights_direct", - "cycles.sample_all_lights_indirect", + "cycles.adaptive_threshold", + "cycles.adaptive_min_samples", + "cycles.time_limit", + "cycles.use_denoising", + "cycles.denoiser", + "cycles.denoising_input_passes", + "cycles.denoising_prefilter", ] preset_subdir = "cycles/sampling" +class AddPresetViewportSampling(AddPresetBase, Operator): + '''Add a Viewport Sampling Preset''' + bl_idname = "render.cycles_viewport_sampling_preset_add" + bl_label = "Add Viewport Sampling Preset" + preset_menu = "CYCLES_PT_viewport_sampling_presets" + + preset_defines = [ + "cycles = bpy.context.scene.cycles" + ] + + preset_values = [ + "cycles.use_preview_adaptive_sampling", + "cycles.preview_samples", + "cycles.preview_adaptive_threshold", + "cycles.preview_adaptive_min_samples", + "cycles.use_preview_denoising", + "cycles.preview_denoiser", + "cycles.preview_denoising_input_passes", + "cycles.preview_denoising_prefilter", + "cycles.preview_denoising_start_sample", + ] + + preset_subdir 
= "cycles/viewport_sampling" + classes = ( AddPresetIntegrator, AddPresetSampling, + AddPresetViewportSampling, ) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 0c3af3fabeb..c2570e71efd 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -39,11 +39,6 @@ enum_devices = ( ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"), ) -from _cycles import with_network -if with_network: - enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),) -del with_network - enum_feature_set = ( ('SUPPORTED', "Supported", "Only use finished and supported features"), ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1), @@ -84,15 +79,6 @@ enum_curve_shape = ( ('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"), ) -enum_tile_order = ( - ('CENTER', "Center", "Render from center to the edges"), - ('RIGHT_TO_LEFT', "Right to Left", "Render from right to left"), - ('LEFT_TO_RIGHT', "Left to Right", "Render from left to right"), - ('TOP_TO_BOTTOM', "Top to Bottom", "Render from top to bottom"), - ('BOTTOM_TO_TOP', "Bottom to Top", "Render from bottom to top"), - ('HILBERT_SPIRAL', "Hilbert Spiral", "Render in a Hilbert Spiral"), -) - enum_use_layer_samples = ( ('USE', "Use", "Per render layer number of samples override scene samples"), ('BOUNDED', "Bounded", "Bound per render layer number of samples by global samples"), @@ -101,15 +87,9 @@ enum_use_layer_samples = ( enum_sampling_pattern = ( ('SOBOL', "Sobol", "Use Sobol random sampling pattern"), - ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"), ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling 
pattern"), ) -enum_integrator = ( - ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"), - ('PATH', "Path Tracing", "Pure path tracing integrator"), -) - enum_volume_sampling = ( ('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"), ('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"), @@ -131,7 +111,6 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), - ('OPENCL', "OpenCL", "OpenCL", 2) ) enum_texture_limit = ( @@ -144,39 +123,46 @@ enum_texture_limit = ( ('4096', "4096", "Limit texture size to 4096 pixels", 6), ('8192', "8192", "Limit texture size to 8192 pixels", 7), ) - + +# NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()` enum_view3d_shading_render_pass = ( ('', "General", ""), - ('COMBINED', "Combined", "Show the Combined Render pass", 1), - ('EMISSION', "Emission", "Show the Emission render pass", 33), - ('BACKGROUND', "Background", "Show the Background render pass", 34), - ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35), + ('COMBINED', "Combined", "Show the Combined Render pass"), + ('EMISSION', "Emission", "Show the Emission render pass"), + ('BACKGROUND', "Background", "Show the Background render pass"), + ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"), + ('SHADOW', "Shadow", "Show the Shadow render pass"), + ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"), ('', "Light", ""), - ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38), - ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39), - ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40), + 
('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass"), + ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass"), + ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass"), - ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41), - ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42), - ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43), + ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass"), + ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass"), + ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass"), ('', "", ""), - ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44), - ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45), - ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46), + ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass"), + ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass"), + ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass"), - ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50), - ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51), + ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass"), + ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass"), ('', "Data", ""), - ('NORMAL', "Normal", "Show the Normal render pass", 3), - ('UV', "UV", "Show the UV render pass", 4), - ('MIST', "Mist", "Show the Mist render pass", 32), + ('POSITION', "Position", "Show the Position render pass"), + ('NORMAL', "Normal", "Show the Normal render pass"), + ('UV', "UV", "Show the UV render pass"), + ('MIST', "Mist", "Show the Mist 
render pass"), + ('DENOISING_ALBEDO', "Denoising Albedo", "Albedo pass used by denoiser"), + ('DENOISING_NORMAL', "Denoising Normal", "Normal pass used by denoiser"), + ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"), ) @@ -208,18 +194,23 @@ def enum_preview_denoiser(self, context): def enum_denoiser(self, context): - items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)] + items = [] items += enum_optix_denoiser(self, context) items += enum_openimagedenoise_denoiser(self, context) return items enum_denoising_input_passes = ( - ('RGB', "Color", "Use only color as input", 1), - ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2), - ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3), + ('RGB', "None", "Don't use utility passes for denoising", 1), + ('RGB_ALBEDO', "Albedo", "Use albedo pass for denoising", 2), + ('RGB_ALBEDO_NORMAL', "Albedo and Normal", "Use albedo and normal passes for denoising", 3), ) +enum_denoising_prefilter = ( + ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1), + ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy using least amount of extra processing time", 2), + ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. 
Improves quality when guiding passes are noisy using extra processing time", 3), +) def update_render_passes(self, context): scene = context.scene @@ -252,13 +243,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Use Open Shading Language (CPU rendering only)", ) - progressive: EnumProperty( - name="Integrator", - description="Method to sample lights and materials", - items=enum_integrator, - default='PATH', - ) - preview_pause: BoolProperty( name="Pause Preview", description="Pause all viewport preview renders", @@ -268,110 +252,88 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): use_denoising: BoolProperty( name="Use Denoising", description="Denoise the rendered image", - default=False, + default=True, update=update_render_passes, ) - use_preview_denoising: BoolProperty( - name="Use Viewport Denoising", - description="Denoise the image in the 3D viewport", - default=False, - ) - denoiser: EnumProperty( name="Denoiser", description="Denoise the image with the selected denoiser. " - "For denoising the image after rendering, denoising data render passes " - "also adapt to the selected denoiser", + "For denoising the image after rendering", items=enum_denoiser, - default=1, + default=4, # Use integer to avoid error in builds without OpenImageDenoise. 
update=update_render_passes, ) + denoising_prefilter: EnumProperty( + name="Denoising Prefilter", + description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser", + items=enum_denoising_prefilter, + default='ACCURATE', + ) + denoising_input_passes: EnumProperty( + name="Denoising Input Passes", + description="Passes used by the denoiser to distinguish noise from shader and geometry detail", + items=enum_denoising_input_passes, + default='RGB_ALBEDO_NORMAL', + ) + + use_preview_denoising: BoolProperty( + name="Use Viewport Denoising", + description="Denoise the image in the 3D viewport", + default=False, + ) preview_denoiser: EnumProperty( name="Viewport Denoiser", description="Denoise the image after each preview update with the selected denoiser", items=enum_preview_denoiser, default=0, ) - - use_square_samples: BoolProperty( - name="Square Samples", - description="Square sampling values for easier artist control", - default=False, + preview_denoising_prefilter: EnumProperty( + name="Viewport Denoising Prefilter", + description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser", + items=enum_denoising_prefilter, + default='FAST', + ) + preview_denoising_input_passes: EnumProperty( + name="Viewport Denoising Input Passes", + description="Passes used by the denoiser to distinguish noise from shader and geometry detail", + items=enum_denoising_input_passes, + default='RGB_ALBEDO', + ) + preview_denoising_start_sample: IntProperty( + name="Start Denoising", + description="Sample to start denoising the preview at", + min=0, max=(1 << 24), + default=1, ) samples: IntProperty( name="Samples", description="Number of samples to render for each pixel", min=1, max=(1 << 24), - default=128, + default=4096, ) preview_samples: IntProperty( name="Viewport Samples", description="Number of samples to render in the viewport, unlimited if 0", min=0, max=(1 << 24), 
- default=32, - ) - aa_samples: IntProperty( - name="AA Samples", - description="Number of antialiasing samples to render for each pixel", - min=1, max=2097151, - default=128, - ) - preview_aa_samples: IntProperty( - name="AA Samples", - description="Number of antialiasing samples to render in the viewport, unlimited if 0", - min=0, max=2097151, - default=32, + default=1024, ) - diffuse_samples: IntProperty( - name="Diffuse Samples", - description="Number of diffuse bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - glossy_samples: IntProperty( - name="Glossy Samples", - description="Number of glossy bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - transmission_samples: IntProperty( - name="Transmission Samples", - description="Number of transmission bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - ao_samples: IntProperty( - name="Ambient Occlusion Samples", - description="Number of ambient occlusion samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - mesh_light_samples: IntProperty( - name="Mesh Light Samples", - description="Number of mesh emission light samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - subsurface_samples: IntProperty( - name="Subsurface Samples", - description="Number of subsurface scattering samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - volume_samples: IntProperty( - name="Volume Samples", - description="Number of volume scattering samples to render for each AA sample", - min=1, max=1024, - default=1, + time_limit: FloatProperty( + name="Time Limit", + description="Limit the render time (excluding synchronization time)." 
+ "Zero disables the limit", + min=0.0, + default=0.0, + step=100.0, + unit='TIME_ABSOLUTE', ) sampling_pattern: EnumProperty( name="Sampling Pattern", description="Random sampling pattern used by the integrator", items=enum_sampling_pattern, - default='SOBOL', + default='PROGRESSIVE_MUTI_JITTER', ) use_layer_samples: EnumProperty( @@ -381,17 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default='USE', ) - sample_all_lights_direct: BoolProperty( - name="Sample All Direct Lights", - description="Sample all lights (for direct samples), rather than randomly picking one", - default=True, - ) - - sample_all_lights_indirect: BoolProperty( - name="Sample All Indirect Lights", - description="Sample all lights (for indirect samples), rather than randomly picking one", - default=True, - ) light_sampling_threshold: FloatProperty( name="Light Sampling Threshold", description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). " @@ -403,19 +354,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): use_adaptive_sampling: BoolProperty( name="Use Adaptive Sampling", description="Automatically reduce the number of samples per pixel based on estimated noise level", - default=False, + default=True, ) - adaptive_threshold: FloatProperty( name="Adaptive Sampling Threshold", description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples", min=0.0, max=1.0, - default=0.0, + soft_min=0.001, + default=0.01, precision=4, ) adaptive_min_samples: IntProperty( name="Adaptive Min Samples", - description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples", + description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. 
Zero for automatic setting based on noise threshold", + min=0, max=4096, + default=0, + ) + + use_preview_adaptive_sampling: BoolProperty( + name="Use Adaptive Sampling", + description="Automatically reduce the number of samples per pixel based on estimated noise level, for viewport renders", + default=True, + ) + preview_adaptive_threshold: FloatProperty( + name="Adaptive Sampling Threshold", + description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders", + min=0.0, max=1.0, + soft_min=0.001, + default=0.1, + precision=4, + ) + preview_adaptive_min_samples: IntProperty( + name="Adaptive Min Samples", + description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders", min=0, max=4096, default=0, ) @@ -632,53 +603,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=10.0, ) - debug_tile_size: IntProperty( - name="Tile Size", - description="", - min=1, max=4096, - default=1024, - ) - - preview_start_resolution: IntProperty( - name="Start Resolution", - description="Resolution to start rendering preview at, " - "progressively increasing it to the full viewport size", - min=8, max=16384, - default=64, - subtype='PIXEL' - ) - preview_denoising_start_sample: IntProperty( - name="Start Denoising", - description="Sample to start denoising the preview at", - min=0, max=(1 << 24), - default=1, - ) - preview_denoising_input_passes: EnumProperty( - name="Viewport Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO', - ) - - debug_reset_timeout: FloatProperty( - name="Reset timeout", - description="", - min=0.01, max=10.0, - default=0.1, - ) - debug_cancel_timeout: FloatProperty( - name="Cancel timeout", - 
description="", - min=0.01, max=10.0, - default=0.1, - ) - debug_text_timeout: FloatProperty( - name="Text timeout", - description="", - min=0.01, max=10.0, - default=1.0, - ) - debug_bvh_type: EnumProperty( name="Viewport BVH Type", description="Choose between faster updates, or faster render", @@ -701,38 +625,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0, min=0, max=16, ) - tile_order: EnumProperty( - name="Tile Order", - description="Tile order for rendering", - items=enum_tile_order, - default='HILBERT_SPIRAL', - options=set(), # Not animatable! - ) - use_progressive_refine: BoolProperty( - name="Progressive Refine", - description="Instead of rendering each tile until it is finished, " - "refine the whole image progressively " - "(this renders somewhat slower, " - "but time can be saved by manually stopping the render when the noise is low enough)", - default=False, - ) bake_type: EnumProperty( name="Bake Type", default='COMBINED', description="Type of pass to bake", items=( - ('COMBINED', "Combined", ""), - ('AO', "Ambient Occlusion", ""), - ('SHADOW', "Shadow", ""), - ('NORMAL', "Normal", ""), - ('UV', "UV", ""), - ('ROUGHNESS', "Roughness", ""), - ('EMIT', "Emit", ""), - ('ENVIRONMENT', "Environment", ""), - ('DIFFUSE', "Diffuse", ""), - ('GLOSSY', "Glossy", ""), - ('TRANSMISSION', "Transmission", ""), + ('COMBINED', "Combined", "", 0), + ('AO', "Ambient Occlusion", "", 1), + ('SHADOW', "Shadow", "", 2), + ('POSITION', "Position", "", 11), + ('NORMAL', "Normal", "", 3), + ('UV', "UV", "", 4), + ('ROUGHNESS', "Roughness", "", 5), + ('EMIT', "Emit", "", 6), + ('ENVIRONMENT', "Environment", "", 7), + ('DIFFUSE', "Diffuse", "", 8), + ('GLOSSY', "Glossy", "", 9), + ('TRANSMISSION', "Transmission", "", 10), ), ) @@ -827,6 +737,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): min=0, max=1024, ) + use_auto_tile: BoolProperty( + name="Auto Tiles", + description="Automatically split image into tiles", + default=True, + ) + tile_size: 
IntProperty( + name="Tile Size", + default=2048, + description="", + min=0, max=16384, + ) + # Various fine-tuning debug flags def _devices_update_callback(self, context): @@ -844,45 +766,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): items=enum_bvh_layouts, default='EMBREE', ) - debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False) debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False) - debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False) - - debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1) - debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False) - - debug_opencl_kernel_type: EnumProperty( - name="OpenCL Kernel Type", - default='DEFAULT', - items=( - ('DEFAULT', "Default", ""), - ('MEGA', "Mega", ""), - ('SPLIT', "Split", ""), - ), - update=CyclesRenderSettings._devices_update_callback - ) - debug_opencl_device_type: EnumProperty( - name="OpenCL Device Type", - default='ALL', - items=( - ('NONE', "None", ""), - ('ALL', "All", ""), - ('DEFAULT', "Default", ""), - ('CPU', "CPU", ""), - ('GPU', "GPU", ""), - ('ACCELERATOR', "Accelerator", ""), - ), - update=CyclesRenderSettings._devices_update_callback - ) - - debug_use_opencl_debug: BoolProperty(name="Debug OpenCL", default=False) - - debug_opencl_mem_limit: IntProperty( - name="Memory limit", - default=0, - description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)" + debug_use_optix_debug: BoolProperty( + name="OptiX Module Debug", + description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level", + default=False ) @classmethod @@ -1031,12 +921,6 @@ class CyclesLightSettings(bpy.types.PropertyGroup): description="Light casts shadows", default=True, ) - samples: IntProperty( - name="Samples", - description="Number of light samples to render for each AA sample", - min=1, max=10000, 
- default=1, - ) max_bounces: IntProperty( name="Max Bounces", description="Maximum number of bounces the light will contribute to the render", @@ -1084,12 +968,6 @@ class CyclesWorldSettings(bpy.types.PropertyGroup): min=4, max=8192, default=1024, ) - samples: IntProperty( - name="Samples", - description="Number of light samples to render for each AA sample", - min=1, max=10000, - default=1, - ) max_bounces: IntProperty( name="Max Bounces", description="Maximum number of bounces the background light will contribute to the render", @@ -1343,91 +1221,25 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): update=update_render_passes, ) + use_pass_shadow_catcher: BoolProperty( + name="Shadow Catcher", + description="Pass containing shadows and light which is to be multiplied into backdrop", + default=False, + update=update_render_passes, + ) + use_denoising: BoolProperty( name="Use Denoising", description="Denoise the rendered image", default=True, update=update_render_passes, ) - denoising_diffuse_direct: BoolProperty( - name="Diffuse Direct", - description="Denoise the direct diffuse lighting", - default=True, - ) - denoising_diffuse_indirect: BoolProperty( - name="Diffuse Indirect", - description="Denoise the indirect diffuse lighting", - default=True, - ) - denoising_glossy_direct: BoolProperty( - name="Glossy Direct", - description="Denoise the direct glossy lighting", - default=True, - ) - denoising_glossy_indirect: BoolProperty( - name="Glossy Indirect", - description="Denoise the indirect glossy lighting", - default=True, - ) - denoising_transmission_direct: BoolProperty( - name="Transmission Direct", - description="Denoise the direct transmission lighting", - default=True, - ) - denoising_transmission_indirect: BoolProperty( - name="Transmission Indirect", - description="Denoise the indirect transmission lighting", - default=True, - ) - denoising_strength: FloatProperty( - name="Denoising Strength", - description="Controls neighbor pixel weighting 
for the denoising filter (lower values preserve more detail, but aren't as smooth)", - min=0.0, max=1.0, - default=0.5, - ) - denoising_feature_strength: FloatProperty( - name="Denoising Feature Strength", - description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", - min=0.0, max=1.0, - default=0.5, - ) - denoising_radius: IntProperty( - name="Denoising Radius", - description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)", - min=1, max=25, - default=8, - subtype="PIXEL", - ) - denoising_relative_pca: BoolProperty( - name="Relative Filter", - description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", - default=False, - ) denoising_store_passes: BoolProperty( name="Store Denoising Passes", description="Store the denoising feature passes and the noisy image. 
The passes adapt to the denoiser selected for rendering", default=False, update=update_render_passes, ) - denoising_neighbor_frames: IntProperty( - name="Neighbor Frames", - description="Number of neighboring frames to use for denoising animations (more frames produce smoother results at the cost of performance)", - min=0, max=7, - default=0, - ) - - denoising_optix_input_passes: EnumProperty( - name="Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO', - ) - denoising_openimagedenoise_input_passes: EnumProperty( - name="Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO_NORMAL', - ) @classmethod def register(cls): @@ -1454,14 +1266,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix, has_opencl = _cycles.get_device_types() + has_cuda, has_optix = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) if has_optix: list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) - if has_opencl: - list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2)) return list compute_device_type: EnumProperty( @@ -1486,7 +1296,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1520,22 +1330,23 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. 
- if compute_device_type in {'CUDA', 'OPTIX', 'OPENCL'}: + if compute_device_type != 'CPU': devices.extend(cpu_devices) return devices - # For backwards compatibility, only returns CUDA and OpenCL but still - # refreshes all devices. - def get_devices(self, compute_device_type=''): + # Refresh device list. This does not happen automatically on Blender + # startup due to unstable OpenCL implementations that can cause crashes. + def refresh_devices(self): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. for device_type in ('CUDA', 'OPTIX', 'OPENCL'): self.update_device_entries(_cycles.available_devices(device_type)) - cuda_devices = self.get_devices_for_type('CUDA') - opencl_devices = self.get_devices_for_type('OPENCL') - return cuda_devices, opencl_devices + # Deprecated: use refresh_devices instead. + def get_devices(self, compute_device_type=''): + self.refresh_devices() + return None def get_num_gpu_devices(self): import _cycles @@ -1601,6 +1412,10 @@ class CyclesView3DShadingSettings(bpy.types.PropertyGroup): items=enum_view3d_shading_render_pass, default='COMBINED', ) + show_active_pixels: BoolProperty( + name="Show Active Pixels", + description="When using adaptive sampling highlight pixels which are being sampled", + ) def register(): diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 47f7b4c6d73..d02627b9936 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -34,6 +34,12 @@ class CYCLES_PT_sampling_presets(PresetPanel, Panel): preset_add_operator = "render.cycles_sampling_preset_add" COMPAT_ENGINES = {'CYCLES'} +class CYCLES_PT_viewport_sampling_presets(PresetPanel, Panel): + bl_label = "Viewport Sampling Presets" + preset_subdir = "cycles/viewport_sampling" + preset_operator = "script.execute_preset" + preset_add_operator = "render.cycles_viewport_sampling_preset_add" + 
COMPAT_ENGINES = {'CYCLES'} class CYCLES_PT_integrator_presets(PresetPanel, Panel): bl_label = "Integrator Presets" @@ -54,6 +60,15 @@ class CyclesButtonsPanel: return context.engine in cls.COMPAT_ENGINES +class CyclesDebugButtonsPanel(CyclesButtonsPanel): + @classmethod + def poll(cls, context): + prefs = bpy.context.preferences + return (CyclesButtonsPanel.poll(context) + and prefs.experimental.use_cycles_debug + and prefs.view.show_developer_ui) + + # Adapt properties editor panel to display in node editor. We have to # copy the class rather than inherit due to the way bpy registration works. def node_panel(cls): @@ -78,12 +93,6 @@ def use_cpu(context): return (get_device_type(context) == 'NONE' or cscene.device == 'CPU') -def use_opencl(context): - cscene = context.scene.cycles - - return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU') - - def use_cuda(context): cscene = context.scene.cycles @@ -96,12 +105,6 @@ def use_optix(context): return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') -def use_branched_path(context): - cscene = context.scene.cycles - - return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context)) - - def use_sample_all_lights(context): cscene = context.scene.cycles @@ -115,57 +118,33 @@ def show_device_active(context): return context.preferences.addons[__package__].preferences.has_active_device() -def draw_samples_info(layout, context): - cscene = context.scene.cycles - integrator = cscene.progressive +def get_effective_preview_denoiser(context): + scene = context.scene + cscene = scene.cycles + + if cscene.preview_denoiser != "AUTO": + return cscene.preview_denoiser + + if context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'): + return 'OPTIX' + + return 'OIDN' - # Calculate sample values - if integrator == 'PATH': - aa = cscene.samples - if cscene.use_square_samples: - aa = aa * aa - else: - aa = cscene.aa_samples - d = cscene.diffuse_samples - g = 
cscene.glossy_samples - t = cscene.transmission_samples - ao = cscene.ao_samples - ml = cscene.mesh_light_samples - sss = cscene.subsurface_samples - vol = cscene.volume_samples - - if cscene.use_square_samples: - aa = aa * aa - d = d * d - g = g * g - t = t * t - ao = ao * ao - ml = ml * ml - sss = sss * sss - vol = vol * vol - - # Draw interface - # Do not draw for progressive, when Square Samples are disabled - if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'): - col = layout.column(align=True) - col.scale_y = 0.6 - col.label(text="Total Samples:") - col.separator() - if integrator == 'PATH': - col.label(text="%s AA" % aa) - else: - col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" % - (aa, d * aa, g * aa, t * aa)) - col.separator() - col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" % - (ao * aa, ml * aa, sss * aa, vol * aa)) class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): bl_label = "Sampling" + def draw(self, context): + pass + + +class CYCLES_RENDER_PT_sampling_viewport(CyclesButtonsPanel, Panel): + bl_label = "Viewport" + bl_parent_id = "CYCLES_RENDER_PT_sampling" + def draw_header_preset(self, context): - CYCLES_PT_sampling_presets.draw_panel_header(self.layout) + CYCLES_PT_viewport_sampling_presets.draw_panel_header(self.layout) def draw(self, context): layout = self.layout @@ -176,29 +155,31 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False - if not use_optix(context): - layout.prop(cscene, "progressive") + heading = layout.column(align=True, heading="Noise Threshold") + row = heading.row(align=True) + row.prop(cscene, "use_preview_adaptive_sampling", text="") + sub = row.row() + sub.active = cscene.use_preview_adaptive_sampling + sub.prop(cscene, "preview_adaptive_threshold", text="") - if not use_branched_path(context): + if cscene.use_preview_adaptive_sampling: col = layout.column(align=True) - 
col.prop(cscene, "samples", text="Render") - col.prop(cscene, "preview_samples", text="Viewport") + col.prop(cscene, "preview_samples", text=" Max Samples") + col.prop(cscene, "preview_adaptive_min_samples", text="Min Samples") else: - col = layout.column(align=True) - col.prop(cscene, "aa_samples", text="Render") - col.prop(cscene, "preview_aa_samples", text="Viewport") + layout.prop(cscene, "preview_samples", text="Samples") - if not use_branched_path(context): - draw_samples_info(layout, context) +class CYCLES_RENDER_PT_sampling_viewport_denoise(CyclesButtonsPanel, Panel): + bl_label = "Denoise" + bl_parent_id = 'CYCLES_RENDER_PT_sampling_viewport' + bl_options = {'DEFAULT_CLOSED'} -class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel): - bl_label = "Sub Samples" - bl_parent_id = "CYCLES_RENDER_PT_sampling" + def draw_header(self, context): + scene = context.scene + cscene = scene.cycles - @classmethod - def poll(cls, context): - return use_branched_path(context) + self.layout.prop(context.scene.cycles, "use_preview_denoising", text="") def draw(self, context): layout = self.layout @@ -208,53 +189,61 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - col = layout.column(align=True) - col.prop(cscene, "diffuse_samples", text="Diffuse") - col.prop(cscene, "glossy_samples", text="Glossy") - col.prop(cscene, "transmission_samples", text="Transmission") - col.prop(cscene, "ao_samples", text="AO") + col = layout.column() + col.active = cscene.use_preview_denoising + col.prop(cscene, "preview_denoiser", text="Denoiser") + col.prop(cscene, "preview_denoising_input_passes", text="Passes") - sub = col.row(align=True) - sub.active = use_sample_all_lights(context) - sub.prop(cscene, "mesh_light_samples", text="Mesh Light") - col.prop(cscene, "subsurface_samples", text="Subsurface") - col.prop(cscene, "volume_samples", text="Volume") + effective_preview_denoiser = 
get_effective_preview_denoiser(context) + if effective_preview_denoiser == 'OPENIMAGEDENOISE': + col.prop(cscene, "preview_denoising_prefilter", text="Prefilter") - draw_samples_info(layout, context) + col.prop(cscene, "preview_denoising_start_sample", text="Start Sample") -class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel): - bl_label = "Adaptive Sampling" +class CYCLES_RENDER_PT_sampling_render(CyclesButtonsPanel, Panel): + bl_label = "Render" bl_parent_id = "CYCLES_RENDER_PT_sampling" - bl_options = {'DEFAULT_CLOSED'} - def draw_header(self, context): - layout = self.layout - scene = context.scene - cscene = scene.cycles - - layout.prop(cscene, "use_adaptive_sampling", text="") + def draw_header_preset(self, context): + CYCLES_PT_sampling_presets.draw_panel_header(self.layout) def draw(self, context): layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False scene = context.scene cscene = scene.cycles - layout.active = cscene.use_adaptive_sampling + layout.use_property_split = True + layout.use_property_decorate = False + + heading = layout.column(align=True, heading="Noise Threshold") + row = heading.row(align=True) + row.prop(cscene, "use_adaptive_sampling", text="") + sub = row.row() + sub.active = cscene.use_adaptive_sampling + sub.prop(cscene, "adaptive_threshold", text="") col = layout.column(align=True) - col.prop(cscene, "adaptive_threshold", text="Noise Threshold") - col.prop(cscene, "adaptive_min_samples", text="Min Samples") + if cscene.use_adaptive_sampling: + col.prop(cscene, "samples", text=" Max Samples") + col.prop(cscene, "adaptive_min_samples", text="Min Samples") + else: + col.prop(cscene, "samples", text="Samples") + col.prop(cscene, "time_limit") -class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel): - bl_label = "Denoising" - bl_parent_id = "CYCLES_RENDER_PT_sampling" +class CYCLES_RENDER_PT_sampling_render_denoise(CyclesButtonsPanel, Panel): + bl_label = "Denoise" + 
bl_parent_id = 'CYCLES_RENDER_PT_sampling_render' bl_options = {'DEFAULT_CLOSED'} + def draw_header(self, context): + scene = context.scene + cscene = scene.cycles + + self.layout.prop(context.scene.cycles, "use_denoising", text="") + def draw(self, context): layout = self.layout layout.use_property_split = True @@ -263,33 +252,12 @@ class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - heading = layout.column(align=True, heading="Render") - row = heading.row(align=True) - row.prop(cscene, "use_denoising", text="") - sub = row.row() - - sub.active = cscene.use_denoising - for view_layer in scene.view_layers: - if view_layer.cycles.denoising_store_passes: - sub.active = True - - sub.prop(cscene, "denoiser", text="") - - layout.separator() - - heading = layout.column(align=False, heading="Viewport") - row = heading.row(align=True) - row.prop(cscene, "use_preview_denoising", text="") - sub = row.row() - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoiser", text="") - - sub = heading.row(align=True) - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample") - sub = heading.row(align=True) - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoising_input_passes", text="Input Passes") + col = layout.column() + col.active = cscene.use_denoising + col.prop(cscene, "denoiser", text="Denoiser") + col.prop(cscene, "denoising_input_passes", text="Passes") + if cscene.denoiser == 'OPENIMAGEDENOISE': + col.prop(cscene, "denoising_prefilter", text="Prefilter") class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): @@ -313,8 +281,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): col.active = not(cscene.use_adaptive_sampling) col.prop(cscene, "sampling_pattern", text="Pattern") - layout.prop(cscene, "use_square_samples") - layout.separator() col = layout.column(align=True) @@ 
-322,11 +288,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): col.prop(cscene, "min_transparent_bounces") col.prop(cscene, "light_sampling_threshold", text="Light Threshold") - if cscene.progressive != 'PATH' and use_branched_path(context): - col = layout.column(align=True) - col.prop(cscene, "sample_all_lights_direct") - col.prop(cscene, "sample_all_lights_indirect") - for view_layer in scene.view_layers: if view_layer.samples > 0: layout.separator() @@ -334,62 +295,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): break -class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel): - bl_label = "Total Samples" - bl_parent_id = "CYCLES_RENDER_PT_sampling" - - @classmethod - def poll(cls, context): - scene = context.scene - cscene = scene.cycles - - if cscene.use_square_samples: - return True - - return cscene.progressive != 'PATH' and use_branched_path(context) - - def draw(self, context): - layout = self.layout - cscene = context.scene.cycles - integrator = cscene.progressive - - # Calculate sample values - if integrator == 'PATH': - aa = cscene.samples - if cscene.use_square_samples: - aa = aa * aa - else: - aa = cscene.aa_samples - d = cscene.diffuse_samples - g = cscene.glossy_samples - t = cscene.transmission_samples - ao = cscene.ao_samples - ml = cscene.mesh_light_samples - sss = cscene.subsurface_samples - vol = cscene.volume_samples - - if cscene.use_square_samples: - aa = aa * aa - d = d * d - g = g * g - t = t * t - ao = ao * ao - ml = ml * ml - sss = sss * sss - vol = vol * vol - - col = layout.column(align=True) - col.scale_y = 0.6 - if integrator == 'PATH': - col.label(text="%s AA" % aa) - else: - col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" % - (aa, d * aa, g * aa, t * aa)) - col.separator() - col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" % - (ao * aa, ml * aa, sss * aa, vol * aa)) - - class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel): bl_label = 
"Subdivision" bl_options = {'DEFAULT_CLOSED'} @@ -548,6 +453,8 @@ class CYCLES_RENDER_PT_light_paths_fast_gi(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False + layout.active = cscene.use_fast_gi + col = layout.column(align=True) col.prop(cscene, "ao_bounces", text="Viewport Bounces") col.prop(cscene, "ao_bounces_render", text="Render Bounces") @@ -716,19 +623,13 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel): layout.use_property_decorate = False scene = context.scene - rd = scene.render cscene = scene.cycles col = layout.column() - - sub = col.column(align=True) - sub.prop(rd, "tile_x", text="Tiles X") - sub.prop(rd, "tile_y", text="Y") - col.prop(cscene, "tile_order", text="Order") - + col.prop(cscene, "use_auto_tile") sub = col.column() - sub.active = not rd.use_save_buffers and not cscene.use_adaptive_sampling - sub.prop(cscene, "use_progressive_refine") + sub.active = cscene.use_auto_tile + sub.prop(cscene, "tile_size") class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Panel): @@ -778,7 +679,6 @@ class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel): col = layout.column() - col.prop(rd, "use_save_buffers") col.prop(rd, "use_persistent_data", text="Persistent Data") @@ -797,7 +697,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel): col = layout.column() col.prop(rd, "preview_pixel_size", text="Pixel Size") - col.prop(cscene, "preview_start_resolution", text="Start Pixels") class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): @@ -818,7 +717,6 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): col = layout.column(heading="Include") col.prop(view_layer, "use_sky", text="Environment") - col.prop(view_layer, "use_ao", text="Ambient Occlusion") col.prop(view_layer, "use_solid", text="Surfaces") col.prop(view_layer, "use_strand", text="Hair") col.prop(view_layer, "use_volumes", text="Volumes") @@ -827,6 +725,9 
@@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): sub = col.row() sub.prop(view_layer, "use_motion_blur", text="Motion Blur") sub.active = rd.use_motion_blur + sub = col.row() + sub.prop(view_layer.cycles, 'use_denoising', text='Denoising') + sub.active = scene.cycles.use_denoising class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel): @@ -872,6 +773,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel): col.prop(view_layer, "use_pass_combined") col.prop(view_layer, "use_pass_z") col.prop(view_layer, "use_pass_mist") + col.prop(view_layer, "use_pass_position") col.prop(view_layer, "use_pass_normal") sub = col.column() sub.active = not rd.use_motion_blur @@ -928,6 +830,7 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel): col.prop(view_layer, "use_pass_environment") col.prop(view_layer, "use_pass_shadow") col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion") + col.prop(cycles_view_layer, "use_pass_shadow_catcher") class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, ViewLayerCryptomattePanel, Panel): @@ -942,70 +845,6 @@ class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, ViewLayerAOVPanel): bl_parent_id = "CYCLES_RENDER_PT_passes" -class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): - bl_label = "Denoising" - bl_context = "view_layer" - bl_options = {'DEFAULT_CLOSED'} - - @classmethod - def poll(cls, context): - cscene = context.scene.cycles - return CyclesButtonsPanel.poll(context) and cscene.use_denoising - - def draw_header(self, context): - scene = context.scene - view_layer = context.view_layer - cycles_view_layer = view_layer.cycles - - layout = self.layout - layout.prop(cycles_view_layer, "use_denoising", text="") - - def draw(self, context): - layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False - - scene = context.scene - view_layer = context.view_layer - cycles_view_layer = view_layer.cycles - denoiser = scene.cycles.denoiser - - 
layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising - - col = layout.column() - - if denoiser == 'OPTIX': - col.prop(cycles_view_layer, "denoising_optix_input_passes") - return - elif denoiser == 'OPENIMAGEDENOISE': - col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes") - return - - col.prop(cycles_view_layer, "denoising_radius", text="Radius") - - col = layout.column() - col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength") - col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength") - col.prop(cycles_view_layer, "denoising_relative_pca") - - layout.separator() - - col = layout.column() - col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes - - row = col.row(heading="Diffuse", align=True) - row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True) - - row = col.row(heading="Glossy", align=True) - row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True) - - row = col.row(heading="Transmission", align=True) - row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True) - - class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -1417,10 +1256,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel): if not (light.type == 'AREA' and clamp.is_portal): sub = col.column() - if use_branched_path(context): - subsub = sub.row(align=True) - subsub.active = use_sample_all_lights(context) - subsub.prop(clamp, "samples") sub.prop(clamp, "max_bounces") sub = col.column(align=True) @@ -1526,34 +1361,6 @@ class 
CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume') -class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel): - bl_label = "Ambient Occlusion" - bl_context = "world" - bl_options = {'DEFAULT_CLOSED'} - - @classmethod - def poll(cls, context): - return context.world and CyclesButtonsPanel.poll(context) - - def draw_header(self, context): - light = context.world.light_settings - self.layout.prop(light, "use_ambient_occlusion", text="") - - def draw(self, context): - layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False - - light = context.world.light_settings - scene = context.scene - - col = layout.column() - sub = col.column() - sub.active = light.use_ambient_occlusion or scene.render.use_simplify - sub.prop(light, "ao_factor", text="Factor") - col.prop(light, "distance", text="Distance") - - class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel): bl_label = "Mist Pass" bl_context = "world" @@ -1650,10 +1457,6 @@ class CYCLES_WORLD_PT_settings_surface(CyclesButtonsPanel, Panel): subsub = sub.row(align=True) subsub.active = cworld.sampling_method == 'MANUAL' subsub.prop(cworld, "sample_map_resolution") - if use_branched_path(context): - subsub = sub.column(align=True) - subsub.active = use_sample_all_lights(context) - subsub.prop(cworld, "samples") sub.prop(cworld, "max_bounces") @@ -1677,8 +1480,7 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel): col = layout.column() sub = col.column() - sub.active = use_cpu(context) - sub.prop(cworld, "volume_sampling", text="Sampling") + col.prop(cworld, "volume_sampling", text="Sampling") col.prop(cworld, "volume_interpolation", text="Interpolation") col.prop(cworld, "homogeneous_volume", text="Homogeneous") sub = col.column() @@ -1817,8 +1619,7 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel): col = layout.column() sub = col.column() - sub.active = use_cpu(context) - sub.prop(cmat, 
"volume_sampling", text="Sampling") + col.prop(cmat, "volume_sampling", text="Sampling") col.prop(cmat, "volume_interpolation", text="Interpolation") col.prop(cmat, "homogeneous_volume", text="Homogeneous") sub = col.column() @@ -1845,9 +1646,6 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): cbk = scene.render.bake rd = scene.render - if use_optix(context): - layout.label(text="Baking is performed using CUDA instead of OptiX", icon='INFO') - if rd.use_bake_multires: layout.operator("object.bake_image", icon='RENDER_STILL') layout.prop(rd, "use_bake_multires") @@ -1905,7 +1703,6 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel): col.prop(cbk, "use_pass_diffuse") col.prop(cbk, "use_pass_glossy") col.prop(cbk, "use_pass_transmission") - col.prop(cbk, "use_pass_ambient_occlusion") col.prop(cbk, "use_pass_emit") elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}: @@ -1989,19 +1786,12 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel): layout.prop(cbk, "use_clear", text="Clear Image") -class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel): bl_label = "Debug" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} COMPAT_ENGINES = {'CYCLES'} - @classmethod - def poll(cls, context): - prefs = bpy.context.preferences - return (CyclesButtonsPanel.poll(context) - and prefs.experimental.use_cycles_debug - and prefs.view.show_developer_ui) - def draw(self, context): layout = self.layout @@ -2018,29 +1808,18 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_bvh_layout") - col.prop(cscene, "debug_use_cpu_split_kernel") col.separator() col = layout.column() col.label(text="CUDA Flags:") col.prop(cscene, "debug_use_cuda_adaptive_compile") - col.prop(cscene, "debug_use_cuda_split_kernel") col.separator() col = layout.column() 
col.label(text="OptiX Flags:") - col.prop(cscene, "debug_optix_cuda_streams") - col.prop(cscene, "debug_optix_curves_api") - - col.separator() - - col = layout.column() - col.label(text="OpenCL Flags:") - col.prop(cscene, "debug_opencl_device_type", text="Device") - col.prop(cscene, "debug_use_opencl_debug", text="Debug") - col.prop(cscene, "debug_opencl_mem_limit") + col.prop(cscene, "debug_use_optix_debug") col.separator() @@ -2141,20 +1920,22 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel): sub.prop(cscene, "distance_cull_margin", text="") -class CYCLES_VIEW3D_PT_shading_render_pass(Panel): +class CyclesShadingButtonsPanel(CyclesButtonsPanel): bl_space_type = 'VIEW_3D' bl_region_type = 'HEADER' - bl_label = "Render Pass" bl_parent_id = 'VIEW3D_PT_shading' - COMPAT_ENGINES = {'CYCLES'} @classmethod def poll(cls, context): return ( - context.engine in cls.COMPAT_ENGINES and + CyclesButtonsPanel.poll(context) and context.space_data.shading.type == 'RENDERED' ) + +class CYCLES_VIEW3D_PT_shading_render_pass(CyclesShadingButtonsPanel, Panel): + bl_label = "Render Pass" + def draw(self, context): shading = context.space_data.shading @@ -2162,6 +1943,26 @@ class CYCLES_VIEW3D_PT_shading_render_pass(Panel): layout.prop(shading.cycles, "render_pass", text="") +class CYCLES_VIEW3D_PT_shading_debug(CyclesDebugButtonsPanel, + CyclesShadingButtonsPanel, + Panel): + bl_label = "Debug" + + @classmethod + def poll(cls, context): + return ( + CyclesDebugButtonsPanel.poll(context) and + CyclesShadingButtonsPanel.poll(context) + ) + + def draw(self, context): + shading = context.space_data.shading + + layout = self.layout + layout.active = context.scene.cycles.use_preview_adaptive_sampling + layout.prop(shading.cycles, "show_active_pixels") + + class CYCLES_VIEW3D_PT_shading_lighting(Panel): bl_space_type = 'VIEW_3D' bl_region_type = 'HEADER' @@ -2275,11 +2076,13 @@ def get_panels(): classes = ( CYCLES_PT_sampling_presets, + 
CYCLES_PT_viewport_sampling_presets, CYCLES_PT_integrator_presets, CYCLES_RENDER_PT_sampling, - CYCLES_RENDER_PT_sampling_sub_samples, - CYCLES_RENDER_PT_sampling_adaptive, - CYCLES_RENDER_PT_sampling_denoising, + CYCLES_RENDER_PT_sampling_viewport, + CYCLES_RENDER_PT_sampling_viewport_denoise, + CYCLES_RENDER_PT_sampling_render, + CYCLES_RENDER_PT_sampling_render_denoise, CYCLES_RENDER_PT_sampling_advanced, CYCLES_RENDER_PT_light_paths, CYCLES_RENDER_PT_light_paths_max_bounces, @@ -2296,6 +2099,7 @@ classes = ( CYCLES_VIEW3D_PT_simplify_greasepencil, CYCLES_VIEW3D_PT_shading_lighting, CYCLES_VIEW3D_PT_shading_render_pass, + CYCLES_VIEW3D_PT_shading_debug, CYCLES_RENDER_PT_motion_blur, CYCLES_RENDER_PT_motion_blur_curve, CYCLES_RENDER_PT_film, @@ -2314,7 +2118,6 @@ classes = ( CYCLES_RENDER_PT_passes_aov, CYCLES_RENDER_PT_filter, CYCLES_RENDER_PT_override, - CYCLES_RENDER_PT_denoising, CYCLES_PT_post_processing, CYCLES_CAMERA_PT_dof, CYCLES_CAMERA_PT_dof_aperture, @@ -2333,7 +2136,6 @@ classes = ( CYCLES_WORLD_PT_preview, CYCLES_WORLD_PT_surface, CYCLES_WORLD_PT_volume, - CYCLES_WORLD_PT_ambient_occlusion, CYCLES_WORLD_PT_mist, CYCLES_WORLD_PT_ray_visibility, CYCLES_WORLD_PT_settings, diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py index 827f84b9873..57da7d7995c 100644 --- a/intern/cycles/blender/addon/version_update.py +++ b/intern/cycles/blender/addon/version_update.py @@ -109,7 +109,7 @@ def do_versions(self): library_versions.setdefault(library.version, []).append(library) # Do versioning per library, since they might have different versions. 
- max_need_versioning = (2, 93, 7) + max_need_versioning = (3, 0, 25) for version, libraries in library_versions.items(): if version > max_need_versioning: continue @@ -166,10 +166,6 @@ def do_versions(self): if not cscene.is_property_set("filter_type"): cscene.pixel_filter_type = 'GAUSSIAN' - # Tile Order - if not cscene.is_property_set("tile_order"): - cscene.tile_order = 'CENTER' - if version <= (2, 76, 10): cscene = scene.cycles if cscene.is_property_set("filter_type"): @@ -186,10 +182,6 @@ def do_versions(self): if version <= (2, 79, 0): cscene = scene.cycles # Default changes - if not cscene.is_property_set("aa_samples"): - cscene.aa_samples = 4 - if not cscene.is_property_set("preview_aa_samples"): - cscene.preview_aa_samples = 4 if not cscene.is_property_set("blur_glossy"): cscene.blur_glossy = 0.0 if not cscene.is_property_set("sample_clamp_indirect"): @@ -203,7 +195,6 @@ def do_versions(self): view_layer.use_pass_cryptomatte_material = cview_layer.get("use_pass_crypto_material", False) view_layer.use_pass_cryptomatte_asset = cview_layer.get("use_pass_crypto_asset", False) view_layer.pass_cryptomatte_depth = cview_layer.get("pass_crypto_depth", 6) - view_layer.use_pass_cryptomatte_accurate = cview_layer.get("pass_crypto_accurate", True) if version <= (2, 93, 7): if scene.render.engine == 'CYCLES': @@ -229,6 +220,35 @@ def do_versions(self): cscene.ao_bounces = 1 cscene.ao_bounces_render = 1 + if version <= (3, 0, 25): + cscene = scene.cycles + + # Default changes. 
+ if not cscene.is_property_set("samples"): + cscene.samples = 128 + if not cscene.is_property_set("preview_samples"): + cscene.preview_samples = 32 + if not cscene.is_property_set("use_adaptive_sampling"): + cscene.use_adaptive_sampling = False + cscene.use_preview_adaptive_sampling = False + if not cscene.is_property_set("use_denoising"): + cscene.use_denoising = False + if not cscene.is_property_set("use_preview_denoising"): + cscene.use_preview_denoising = False + if not cscene.is_property_set("sampling_pattern"): + cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER' + + # Removal of square samples. + cscene = scene.cycles + use_square_samples = cscene.get("use_square_samples", False) + + if use_square_samples: + cscene.samples *= cscene.samples + cscene.preview_samples *= cscene.preview_samples + for layer in scene.view_layers: + layer.samples *= layer.samples + cscene["use_square_samples"] = False + # Lamps for light in bpy.data.lights: if light.library not in libraries: @@ -249,10 +269,6 @@ def do_versions(self): if version <= (2, 76, 9): cworld = world.cycles - # World MIS Samples - if not cworld.is_property_set("samples"): - cworld.samples = 4 - # World MIS Resolution if not cworld.is_property_set("sample_map_resolution"): cworld.sample_map_resolution = 256 diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index 6954c5c2f26..4e8df5a99a6 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -894,12 +894,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d, } } -BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d, - BL::RegionView3D &b_rv3d, - Camera *cam, - int width, - int height, - const bool use_denoiser) +BufferParams BlenderSync::get_buffer_params( + BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height) { BufferParams params; bool use_border = false; @@ -931,11 +927,6 @@ BufferParams 
BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d, params.height = height; } - PassType display_pass = update_viewport_display_passes(b_v3d, params.passes); - - /* Can only denoise the combined image pass */ - params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser; - return params; } diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index d51b31de638..ce1770f18a3 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -25,7 +25,6 @@ CCL_NAMESPACE_BEGIN enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, - COMPUTE_DEVICE_OPENCL = 2, COMPUTE_DEVICE_OPTIX = 3, COMPUTE_DEVICE_NUM @@ -68,13 +67,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen device = Device::get_multi_device(devices, threads, background); } } - else if (get_enum(cscene, "device") == 2) { - /* Find network device. */ - vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK); - if (!devices.empty()) { - device = devices.front(); - } - } else if (get_enum(cscene, "device") == 1) { /* Test if we are using GPU devices. */ ComputeDevice compute_device = (ComputeDevice)get_enum( @@ -89,9 +81,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_OPTIX) { mask |= DEVICE_MASK_OPTIX; } - else if (compute_device == COMPUTE_DEVICE_OPENCL) { - mask |= DEVICE_MASK_OPENCL; - } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. 
*/ diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp new file mode 100644 index 00000000000..a79232af71f --- /dev/null +++ b/intern/cycles/blender/blender_gpu_display.cpp @@ -0,0 +1,761 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "blender/blender_gpu_display.h" + +#include "device/device.h" +#include "util/util_logging.h" +#include "util/util_opengl.h" + +extern "C" { +struct RenderEngine; + +bool RE_engine_has_render_context(struct RenderEngine *engine); +void RE_engine_render_context_enable(struct RenderEngine *engine); +void RE_engine_render_context_disable(struct RenderEngine *engine); + +bool DRW_opengl_context_release(); +void DRW_opengl_context_activate(bool drw_state); + +void *WM_opengl_context_create(); +void WM_opengl_context_activate(void *gl_context); +void WM_opengl_context_dispose(void *gl_context); +void WM_opengl_context_release(void *context); +} + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * BlenderDisplayShader. 
+ */ + +unique_ptr<BlenderDisplayShader> BlenderDisplayShader::create(BL::RenderEngine &b_engine, + BL::Scene &b_scene) +{ + if (b_engine.support_display_space_shader(b_scene)) { + return make_unique<BlenderDisplaySpaceShader>(b_engine, b_scene); + } + + return make_unique<BlenderFallbackDisplayShader>(); +} + +int BlenderDisplayShader::get_position_attrib_location() +{ + if (position_attribute_location_ == -1) { + const uint shader_program = get_shader_program(); + position_attribute_location_ = glGetAttribLocation(shader_program, position_attribute_name); + } + return position_attribute_location_; +} + +int BlenderDisplayShader::get_tex_coord_attrib_location() +{ + if (tex_coord_attribute_location_ == -1) { + const uint shader_program = get_shader_program(); + tex_coord_attribute_location_ = glGetAttribLocation(shader_program, tex_coord_attribute_name); + } + return tex_coord_attribute_location_; +} + +/* -------------------------------------------------------------------- + * BlenderFallbackDisplayShader. + */ + +/* TODO move shaders to standalone .glsl file. 
*/ +static const char *FALLBACK_VERTEX_SHADER = + "#version 330\n" + "uniform vec2 fullscreen;\n" + "in vec2 texCoord;\n" + "in vec2 pos;\n" + "out vec2 texCoord_interp;\n" + "\n" + "vec2 normalize_coordinates()\n" + "{\n" + " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" + " texCoord_interp = texCoord;\n" + "}\n\0"; + +static const char *FALLBACK_FRAGMENT_SHADER = + "#version 330\n" + "uniform sampler2D image_texture;\n" + "in vec2 texCoord_interp;\n" + "out vec4 fragColor;\n" + "\n" + "void main()\n" + "{\n" + " fragColor = texture(image_texture, texCoord_interp);\n" + "}\n\0"; + +static void shader_print_errors(const char *task, const char *log, const char *code) +{ + LOG(ERROR) << "Shader: " << task << " error:"; + LOG(ERROR) << "===== shader string ===="; + + stringstream stream(code); + string partial; + + int line = 1; + while (getline(stream, partial, '\n')) { + if (line < 10) { + LOG(ERROR) << " " << line << " " << partial; + } + else { + LOG(ERROR) << line << " " << partial; + } + line++; + } + LOG(ERROR) << log; +} + +static int compile_fallback_shader(void) +{ + const struct Shader { + const char *source; + const GLenum type; + } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, + {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; + + const GLuint program = glCreateProgram(); + + for (int i = 0; i < 2; i++) { + const GLuint shader = glCreateShader(shaders[i].type); + + string source_str = shaders[i].source; + const char *c_str = source_str.c_str(); + + glShaderSource(shader, 1, &c_str, NULL); + glCompileShader(shader); + + GLint compile_status; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status); + + if (!compile_status) { + GLchar log[5000]; + GLsizei length = 0; + glGetShaderInfoLog(shader, sizeof(log), &length, log); + shader_print_errors("compile", log, c_str); + return 0; + } + + glAttachShader(program, shader); + } + + /* 
Link output. */ + glBindFragDataLocation(program, 0, "fragColor"); + + /* Link and error check. */ + glLinkProgram(program); + + /* TODO(sergey): Find a way to nicely de-duplicate the error checking. */ + GLint link_status; + glGetProgramiv(program, GL_LINK_STATUS, &link_status); + if (!link_status) { + GLchar log[5000]; + GLsizei length = 0; + /* TODO(sergey): Is it really program passed to glGetShaderInfoLog? */ + glGetShaderInfoLog(program, sizeof(log), &length, log); + shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); + shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); + return 0; + } + + return program; +} + +void BlenderFallbackDisplayShader::bind(int width, int height) +{ + create_shader_if_needed(); + + if (!shader_program_) { + return; + } + + glUseProgram(shader_program_); + glUniform1i(image_texture_location_, 0); + glUniform2f(fullscreen_location_, width, height); +} + +void BlenderFallbackDisplayShader::unbind() +{ +} + +uint BlenderFallbackDisplayShader::get_shader_program() +{ + return shader_program_; +} + +void BlenderFallbackDisplayShader::create_shader_if_needed() +{ + if (shader_program_ || shader_compile_attempted_) { + return; + } + + shader_compile_attempted_ = true; + + shader_program_ = compile_fallback_shader(); + if (!shader_program_) { + return; + } + + glUseProgram(shader_program_); + + image_texture_location_ = glGetUniformLocation(shader_program_, "image_texture"); + if (image_texture_location_ < 0) { + LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; + destroy_shader(); + return; + } + + fullscreen_location_ = glGetUniformLocation(shader_program_, "fullscreen"); + if (fullscreen_location_ < 0) { + LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; + destroy_shader(); + return; + } +} + +void BlenderFallbackDisplayShader::destroy_shader() +{ + glDeleteProgram(shader_program_); + shader_program_ = 0; +} + +/* -------------------------------------------------------------------- 
+ * BlenderDisplaySpaceShader. + */ + +BlenderDisplaySpaceShader::BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, + BL::Scene &b_scene) + : b_engine_(b_engine), b_scene_(b_scene) +{ + DCHECK(b_engine_.support_display_space_shader(b_scene_)); +} + +void BlenderDisplaySpaceShader::bind(int /*width*/, int /*height*/) +{ + b_engine_.bind_display_space_shader(b_scene_); +} + +void BlenderDisplaySpaceShader::unbind() +{ + b_engine_.unbind_display_space_shader(); +} + +uint BlenderDisplaySpaceShader::get_shader_program() +{ + if (!shader_program_) { + glGetIntegerv(GL_CURRENT_PROGRAM, reinterpret_cast<int *>(&shader_program_)); + } + + if (!shader_program_) { + LOG(ERROR) << "Error retrieving shader program for display space shader."; + } + + return shader_program_; +} + +/* -------------------------------------------------------------------- + * BlenderGPUDisplay. + */ + +BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene) + : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene)) +{ + /* Create context while on the main thread. */ + gl_context_create(); +} + +BlenderGPUDisplay::~BlenderGPUDisplay() +{ + gl_resources_destroy(); +} + +/* -------------------------------------------------------------------- + * Update procedure. + */ + +bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) +{ + /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing + * the texture does not happen at the same time. This is achieved indirectly. + * + * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock. + * This same lock is also held when do_draw() is called, which together ensure mutual + * exclusion. + * + * This locking is not performed at the GPU display level, because that would cause lock + * inversion. 
*/ + if (!gl_context_enable()) { + return false; + } + + if (gl_render_sync_) { + glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED); + } + + if (!gl_texture_resources_ensure()) { + gl_context_disable(); + return false; + } + + /* Update texture dimensions if needed. */ + if (texture_.width != texture_width || texture_.height != texture_height) { + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + glTexImage2D( + GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0); + texture_.width = texture_width; + texture_.height = texture_height; + glBindTexture(GL_TEXTURE_2D, 0); + + /* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to + * avoid undefined content. */ + texture_.need_clear = true; + } + + /* Update PBO dimensions if needed. + * + * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in, + * at a resolution divider 1. This was we don't need to recreate graphics interoperability + * objects which are costly and which are tied to the specific underlying buffer size. + * The downside of this approach is that when graphics interopeability is not used we are sending + * too much data to GPU when resolution divider is not 1. */ + /* TODO(sergey): Investigate whether keeping the PBO exact size of the texute makes non-interop + * mode faster. 
*/ + const int buffer_width = params.full_size.x; + const int buffer_height = params.full_size.y; + if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) { + const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height; + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + texture_.buffer_width = buffer_width; + texture_.buffer_height = buffer_height; + } + + /* New content will be provided to the texture in one way or another, so mark this in a + * centralized place. */ + texture_.need_update = true; + + return true; +} + +void BlenderGPUDisplay::do_update_end() +{ + gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + + gl_context_disable(); +} + +/* -------------------------------------------------------------------- + * Texture update from CPU buffer. + */ + +void BlenderGPUDisplay::do_copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) +{ + /* This call copies pixels to a Pixel Buffer Object (PBO) which is much cheaper from CPU time + * point of view than to copy data directly to the OpenGL texture. + * + * The possible downside of this approach is that it might require a higher peak memory when + * doing partial updates of the texture (although, in practice even partial updates might peak + * with a full-frame buffer stored on the CPU if the GPU is currently occupied). 
*/ + + half4 *mapped_rgba_pixels = map_texture_buffer(); + if (!mapped_rgba_pixels) { + return; + } + + if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width && + pixels_height == texture_.height) { + const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height; + memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes); + } + else { + const half4 *rgba_row = rgba_pixels; + half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x; + for (int y = 0; y < pixels_height; + ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) { + memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width); + } + } + + unmap_texture_buffer(); +} + +/* -------------------------------------------------------------------- + * Texture buffer mapping. + */ + +half4 *BlenderGPUDisplay::do_map_texture_buffer() +{ + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + + half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>( + glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY)); + if (!mapped_rgba_pixels) { + LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object."; + } + + if (texture_.need_clear) { + const int64_t texture_width = texture_.width; + const int64_t texture_height = texture_.height; + memset(reinterpret_cast<void *>(mapped_rgba_pixels), + 0, + texture_width * texture_height * sizeof(half4)); + texture_.need_clear = false; + } + + return mapped_rgba_pixels; +} + +void BlenderGPUDisplay::do_unmap_texture_buffer() +{ + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); +} + +/* -------------------------------------------------------------------- + * Graphics interoperability. 
+ */ + +DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get() +{ + DeviceGraphicsInteropDestination interop_dst; + + interop_dst.buffer_width = texture_.buffer_width; + interop_dst.buffer_height = texture_.buffer_height; + interop_dst.opengl_pbo_id = texture_.gl_pbo_id; + + interop_dst.need_clear = texture_.need_clear; + texture_.need_clear = false; + + return interop_dst; +} + +void BlenderGPUDisplay::graphics_interop_activate() +{ + gl_context_enable(); +} + +void BlenderGPUDisplay::graphics_interop_deactivate() +{ + gl_context_disable(); +} + +/* -------------------------------------------------------------------- + * Drawing. + */ + +void BlenderGPUDisplay::clear() +{ + texture_.need_clear = true; +} + +void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) +{ + /* See do_update_begin() for why no locking is required here. */ + const bool transparent = true; // TODO(sergey): Derive this from Film. + + if (texture_.need_clear) { + /* Texture is requested to be cleared and was not yet cleared. + * Do early return which should be equivalent of drawing all-zero texture. */ + return; + } + + if (!gl_draw_resources_ensure()) { + return; + } + + if (use_gl_context_) { + gl_context_mutex_.lock(); + } + + if (gl_upload_sync_) { + glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED); + } + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + display_shader_->bind(params.full_size.x, params.full_size.y); + + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_); + + texture_update_if_needed(); + vertex_buffer_update(params); + + /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? 
*/ + GLuint vertex_array_object; + glGenVertexArrays(1, &vertex_array_object); + glBindVertexArray(vertex_array_object); + + const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location(); + const int position_attribute = display_shader_->get_position_attrib_location(); + + glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindTexture(GL_TEXTURE_2D, 0); + + glDeleteVertexArrays(1, &vertex_array_object); + + display_shader_->unbind(); + + if (transparent) { + glDisable(GL_BLEND); + } + + gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + + if (use_gl_context_) { + gl_context_mutex_.unlock(); + } +} + +void BlenderGPUDisplay::gl_context_create() +{ + /* When rendering in viewport there is no render context available via engine. + * Check whether own context is to be created here. + * + * NOTE: If the `b_engine_`'s context is not available, we are expected to be on a main thread + * here. */ + use_gl_context_ = !RE_engine_has_render_context( + reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); + + if (use_gl_context_) { + const bool drw_state = DRW_opengl_context_release(); + gl_context_ = WM_opengl_context_create(); + if (gl_context_) { + /* On Windows an old context is restored after creation, and subsequent release of context + * generates a Win32 error. Harmless for users, but annoying to have possible misleading + * error prints in the console. 
*/ +#ifndef _WIN32 + WM_opengl_context_release(gl_context_); +#endif + } + else { + LOG(ERROR) << "Error creating OpenGL context."; + } + + DRW_opengl_context_activate(drw_state); + } +} + +bool BlenderGPUDisplay::gl_context_enable() +{ + if (use_gl_context_) { + if (!gl_context_) { + return false; + } + gl_context_mutex_.lock(); + WM_opengl_context_activate(gl_context_); + return true; + } + + RE_engine_render_context_enable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); + return true; +} + +void BlenderGPUDisplay::gl_context_disable() +{ + if (use_gl_context_) { + if (gl_context_) { + WM_opengl_context_release(gl_context_); + gl_context_mutex_.unlock(); + } + return; + } + + RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); +} + +void BlenderGPUDisplay::gl_context_dispose() +{ + if (gl_context_) { + const bool drw_state = DRW_opengl_context_release(); + + WM_opengl_context_activate(gl_context_); + WM_opengl_context_dispose(gl_context_); + + DRW_opengl_context_activate(drw_state); + } +} + +bool BlenderGPUDisplay::gl_draw_resources_ensure() +{ + if (!texture_.gl_id) { + /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it can + * can not continue. Note that this is not an unrecoverable error, so once the texture is known + * we will come back here and create all the GPU resources needed for draw. 
*/ + return false; + } + + if (gl_draw_resource_creation_attempted_) { + return gl_draw_resources_created_; + } + gl_draw_resource_creation_attempted_ = true; + + if (!vertex_buffer_) { + glGenBuffers(1, &vertex_buffer_); + if (!vertex_buffer_) { + LOG(ERROR) << "Error creating vertex buffer."; + return false; + } + } + + gl_draw_resources_created_ = true; + + return true; +} + +void BlenderGPUDisplay::gl_resources_destroy() +{ + gl_context_enable(); + + if (vertex_buffer_ != 0) { + glDeleteBuffers(1, &vertex_buffer_); + } + + if (texture_.gl_pbo_id) { + glDeleteBuffers(1, &texture_.gl_pbo_id); + texture_.gl_pbo_id = 0; + } + + if (texture_.gl_id) { + glDeleteTextures(1, &texture_.gl_id); + texture_.gl_id = 0; + } + + gl_context_disable(); + + gl_context_dispose(); +} + +bool BlenderGPUDisplay::gl_texture_resources_ensure() +{ + if (texture_.creation_attempted) { + return texture_.is_created; + } + texture_.creation_attempted = true; + + DCHECK(!texture_.gl_id); + DCHECK(!texture_.gl_pbo_id); + + /* Create texture. */ + glGenTextures(1, &texture_.gl_id); + if (!texture_.gl_id) { + LOG(ERROR) << "Error creating texture."; + return false; + } + + /* Configure the texture. */ + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); + + /* Create PBO for the texture. */ + glGenBuffers(1, &texture_.gl_pbo_id); + if (!texture_.gl_pbo_id) { + LOG(ERROR) << "Error creating texture pixel buffer object."; + return false; + } + + /* Creation finished with a success. 
*/ + texture_.is_created = true; + + return true; +} + +void BlenderGPUDisplay::texture_update_if_needed() +{ + if (!texture_.need_update) { + return; + } + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + glTexSubImage2D( + GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + texture_.need_update = false; +} + +void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams ¶ms) +{ + /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be + * rendered. */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY)); + if (!vpointer) { + return; + } + + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = params.offset.x; + vpointer[3] = params.offset.y; + + vpointer[4] = 1.0f; + vpointer[5] = 0.0f; + vpointer[6] = (float)params.size.x + params.offset.x; + vpointer[7] = params.offset.y; + + vpointer[8] = 1.0f; + vpointer[9] = 1.0f; + vpointer[10] = (float)params.size.x + params.offset.x; + vpointer[11] = (float)params.size.y + params.offset.y; + + vpointer[12] = 0.0f; + vpointer[13] = 1.0f; + vpointer[14] = params.offset.x; + vpointer[15] = (float)params.size.y + params.offset.y; + + glUnmapBuffer(GL_ARRAY_BUFFER); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h new file mode 100644 index 00000000000..b7eddf0afa7 --- /dev/null +++ b/intern/cycles/blender/blender_gpu_display.h @@ -0,0 +1,211 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <atomic> + +#include "MEM_guardedalloc.h" + +#include "RNA_blender_cpp.h" + +#include "render/gpu_display.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Base class of shader used for GPU display rendering. */ +class BlenderDisplayShader { + public: + static constexpr const char *position_attribute_name = "pos"; + static constexpr const char *tex_coord_attribute_name = "texCoord"; + + /* Create shader implementation suitable for the given render engine and scene configuration. */ + static unique_ptr<BlenderDisplayShader> create(BL::RenderEngine &b_engine, BL::Scene &b_scene); + + BlenderDisplayShader() = default; + virtual ~BlenderDisplayShader() = default; + + virtual void bind(int width, int height) = 0; + virtual void unbind() = 0; + + /* Get attribute location for position and texture coordinate respectively. + * NOTE: The shader needs to be bound to have access to those. */ + virtual int get_position_attrib_location(); + virtual int get_tex_coord_attrib_location(); + + protected: + /* Get program of this display shader. + * NOTE: The shader needs to be bound to have access to this. */ + virtual uint get_shader_program() = 0; + + /* Cached values of various OpenGL resources. */ + int position_attribute_location_ = -1; + int tex_coord_attribute_location_ = -1; +}; + +/* Implementation of display rendering shader used in the case when render engine does not support + * display space shader. 
*/ +class BlenderFallbackDisplayShader : public BlenderDisplayShader { + public: + virtual void bind(int width, int height) override; + virtual void unbind() override; + + protected: + virtual uint get_shader_program() override; + + void create_shader_if_needed(); + void destroy_shader(); + + uint shader_program_ = 0; + int image_texture_location_ = -1; + int fullscreen_location_ = -1; + + /* Shader compilation attempted. Which means, that if the shader program is 0 then compilation or + * linking has failed. Do not attempt to re-compile the shader. */ + bool shader_compile_attempted_ = false; +}; + +class BlenderDisplaySpaceShader : public BlenderDisplayShader { + public: + BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene); + + virtual void bind(int width, int height) override; + virtual void unbind() override; + + protected: + virtual uint get_shader_program() override; + + BL::RenderEngine b_engine_; + BL::Scene &b_scene_; + + /* Cached values of various OpenGL resources. */ + uint shader_program_ = 0; +}; + +/* GPU display implementation which is specific for Blender viewport integration. 
*/ +class BlenderGPUDisplay : public GPUDisplay { + public: + BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene); + ~BlenderGPUDisplay(); + + virtual void graphics_interop_activate() override; + virtual void graphics_interop_deactivate() override; + + virtual void clear() override; + + protected: + virtual bool do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) override; + virtual void do_update_end() override; + + virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, + int texture_x, + int texture_y, + int pixels_width, + int pixels_height) override; + virtual void do_draw(const GPUDisplayParams ¶ms) override; + + virtual half4 *do_map_texture_buffer() override; + virtual void do_unmap_texture_buffer() override; + + virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override; + + /* Helper function which allocates new GPU context. */ + void gl_context_create(); + bool gl_context_enable(); + void gl_context_disable(); + void gl_context_dispose(); + + /* Make sure texture is allocated and its initial configuration is performed. */ + bool gl_texture_resources_ensure(); + + /* Ensure all runtime GPU resources needed for drawing are allocated. + * Returns true if all resources needed for drawing are available. */ + bool gl_draw_resources_ensure(); + + /* Destroy all GPU resources which are being used by this object. */ + void gl_resources_destroy(); + + /* Update GPU texture dimensions and content if needed (new pixel data was provided). + * + * NOTE: The texture needs to be bound. */ + void texture_update_if_needed(); + + /* Update vertex buffer with new coordinates of vertex positions and texture coordinates. + * This buffer is used to render texture in the viewport. + * + * NOTE: The buffer needs to be bound. */ + void vertex_buffer_update(const GPUDisplayParams ¶ms); + + BL::RenderEngine b_engine_; + + /* OpenGL context which is used when the render engine doesn't have its own. 
*/ + void *gl_context_ = nullptr; + /* Is true when the Blender RenderEngine side context is not available and the GPUDisplay is to create + * its own context. */ + bool use_gl_context_ = false; + /* Mutex used to guard the `gl_context_`. */ + thread_mutex gl_context_mutex_; + + /* Texture which contains pixels of the render result. */ + struct { + /* Indicates whether texture creation was attempted and succeeded. + * Used to avoid multiple attempts of texture creation on GPU issues or GPU context + * misconfiguration. */ + bool creation_attempted = false; + bool is_created = false; + + /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write + * pixels to it. + * + * NOTE: Allocated on the engine's context. */ + uint gl_id = 0; + uint gl_pbo_id = 0; + + /* Is true when new data was written to the PBO, meaning, the texture might need to be resized + * and new data is to be uploaded to the GPU. */ + bool need_update = false; + + /* Content of the texture is to be filled with zeroes. */ + std::atomic<bool> need_clear = true; + + /* Dimensions of the texture in pixels. */ + int width = 0; + int height = 0; + + /* Dimensions of the underlying PBO. */ + int buffer_width = 0; + int buffer_height = 0; + } texture_; + + unique_ptr<BlenderDisplayShader> display_shader_; + + /* Special track of whether GPU resources were attempted to be created, to avoid attempts of + * their re-creation on failure on every redraw. */ + bool gl_draw_resource_creation_attempted_ = false; + bool gl_draw_resources_created_ = false; + + /* Vertex buffer which holds vertices of a triangle fan which is textured with the texture + * holding the render result. 
*/ + uint vertex_buffer_ = 0; + + void *gl_render_sync_ = nullptr; + void *gl_upload_sync_ = nullptr; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp index 542028f4b2f..4df1e720dde 100644 --- a/intern/cycles/blender/blender_light.cpp +++ b/intern/cycles/blender/blender_light.cpp @@ -125,17 +125,10 @@ void BlenderSync::sync_light(BL::Object &b_parent, light->set_shader(static_cast<Shader *>(used_shaders[0])); /* shadow */ - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles"); light->set_cast_shadow(get_boolean(clight, "cast_shadow")); light->set_use_mis(get_boolean(clight, "use_multiple_importance_sampling")); - int samples = get_int(clight, "samples"); - if (get_boolean(cscene, "use_square_samples")) - light->set_samples(samples * samples); - else - light->set_samples(samples); - light->set_max_bounces(get_int(clight, "max_bounces")); if (b_ob_info.real_object != b_ob_info.iter_object) { @@ -155,10 +148,12 @@ void BlenderSync::sync_light(BL::Object &b_parent, /* visibility */ uint visibility = object_ray_visibility(b_ob_info.real_object); + light->set_use_camera((visibility & PATH_RAY_CAMERA) != 0); light->set_use_diffuse((visibility & PATH_RAY_DIFFUSE) != 0); light->set_use_glossy((visibility & PATH_RAY_GLOSSY) != 0); light->set_use_transmission((visibility & PATH_RAY_TRANSMIT) != 0); light->set_use_scatter((visibility & PATH_RAY_VOLUME_SCATTER) != 0); + light->set_is_shadow_catcher(b_ob_info.real_object.is_shadow_catcher()); /* tag */ light->tag_update(scene); @@ -169,7 +164,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) BL::World b_world = b_scene.world(); if (b_world) { - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles"); enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM }; 
@@ -197,12 +191,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) /* force enable light again when world is resynced */ light->set_is_enabled(true); - int samples = get_int(cworld, "samples"); - if (get_boolean(cscene, "use_square_samples")) - light->set_samples(samples * samples); - else - light->set_samples(samples); - light->tag_update(scene); light_map.set_recalc(b_world); } @@ -211,7 +199,7 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) world_map = b_world.ptr.data; world_recalc = false; - viewport_parameters = BlenderViewportParameters(b_v3d); + viewport_parameters = BlenderViewportParameters(b_v3d, use_developer_ui); } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 22d6edeb099..95da4a2df84 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -568,7 +568,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, /* object loop */ bool cancel = false; bool use_portal = false; - const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights; + const bool show_lights = BlenderViewportParameters(b_v3d, use_developer_ui).use_scene_lights; BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); BL::Depsgraph::object_instances_iterator b_instance_iter; diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 6e06b6a468f..694d8454422 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -45,10 +45,6 @@ # include <OSL/oslquery.h> #endif -#ifdef WITH_OPENCL -# include "device/device_intern.h" -#endif - CCL_NAMESPACE_BEGIN namespace { @@ -72,12 +68,10 @@ PyObject *pyunicode_from_string(const char *str) /* Synchronize debug flags from a given Blender scene. * Return truth when device list needs invalidation. 
*/ -bool debug_flags_sync_from_scene(BL::Scene b_scene) +static void debug_flags_sync_from_scene(BL::Scene b_scene) { DebugFlagsRef flags = DebugFlags(); PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); - /* Backup some settings for comparison. */ - DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; /* Synchronize shared flags. */ flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type"); /* Synchronize CPU flags. */ @@ -87,50 +81,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout"); - flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); - flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OptiX flags. */ - flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams"); - flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api"); - /* Synchronize OpenCL device type. */ - switch (get_enum(cscene, "debug_opencl_device_type")) { - case 0: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE; - break; - case 1: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ALL; - break; - case 2: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_DEFAULT; - break; - case 3: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_CPU; - break; - case 4: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_GPU; - break; - case 5: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR; - break; - } - /* Synchronize other OpenCL flags. 
*/ - flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); - flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit")) * 1024 * 1024; - return flags.opencl.device_type != opencl_device_type; + flags.optix.use_debug = get_boolean(cscene, "debug_use_optix_debug"); } /* Reset debug flags to default values. * Return truth when device list needs invalidation. */ -bool debug_flags_reset() +static void debug_flags_reset() { DebugFlagsRef flags = DebugFlags(); - /* Backup some settings for comparison. */ - DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; flags.reset(); - return flags.opencl.device_type != opencl_device_type; } } /* namespace */ @@ -175,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce) static PyObject *init_func(PyObject * /*self*/, PyObject *args) { - PyObject *path, *user_path; + PyObject *path, *user_path, *temp_path; int headless; - if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) { - return NULL; + if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) { + return nullptr; } - PyObject *path_coerce = NULL, *user_path_coerce = NULL; + PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr; path_init(PyC_UnicodeAsByte(path, &path_coerce), - PyC_UnicodeAsByte(user_path, &user_path_coerce)); + PyC_UnicodeAsByte(user_path, &user_path_coerce), + PyC_UnicodeAsByte(temp_path, &temp_path_coerce)); Py_XDECREF(path_coerce); Py_XDECREF(user_path_coerce); + Py_XDECREF(temp_path_coerce); BlenderSession::headless = headless; @@ -299,6 +264,50 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } +static PyObject *render_frame_finish_func(PyObject * /*self*/, PyObject *args) +{ + PyObject *pysession; + + if (!PyArg_ParseTuple(args, "O", &pysession)) { + return nullptr; + } + + BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession); + + /* Allow 
Blender to execute other Python scripts. */ + python_thread_state_save(&session->python_thread_state); + + session->render_frame_finish(); + + python_thread_state_restore(&session->python_thread_state); + + Py_RETURN_NONE; +} + +static PyObject *draw_func(PyObject * /*self*/, PyObject *args) +{ + PyObject *py_session, *py_graph, *py_screen, *py_space_image; + + if (!PyArg_ParseTuple(args, "OOOO", &py_session, &py_graph, &py_screen, &py_space_image)) { + return nullptr; + } + + BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(py_session); + + ID *b_screen = (ID *)PyLong_AsVoidPtr(py_screen); + + PointerRNA b_space_image_ptr; + RNA_pointer_create(b_screen, + &RNA_SpaceImageEditor, + pylong_as_voidptr_typesafe(py_space_image), + &b_space_image_ptr); + BL::SpaceImageEditor b_space_image(b_space_image_ptr); + + session->draw(b_space_image); + + Py_RETURN_NONE; +} + /* pixel_array and result passed as pointers */ static PyObject *bake_func(PyObject * /*self*/, PyObject *args) { @@ -336,7 +345,7 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } -static PyObject *draw_func(PyObject * /*self*/, PyObject *args) +static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args) { PyObject *pysession, *pygraph, *pyv3d, *pyrv3d; @@ -350,7 +359,7 @@ static PyObject *draw_func(PyObject * /*self*/, PyObject *args) int viewport[4]; glGetIntegerv(GL_VIEWPORT, viewport); - session->draw(viewport[2], viewport[3]); + session->view_draw(viewport[2], viewport[3]); } Py_RETURN_NONE; @@ -697,40 +706,6 @@ static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/) return pyunicode_from_string(system_info.c_str()); } -#ifdef WITH_OPENCL -static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/) -{ - VLOG(2) << "Disabling OpenCL platform."; - DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE; - Py_RETURN_NONE; -} - -static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args) 
-{ - PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence"); - if (sequence == NULL) { - Py_RETURN_FALSE; - } - - vector<string> parameters; - for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) { - PyObject *item = PySequence_Fast_GET_ITEM(sequence, i); - PyObject *item_as_string = PyObject_Str(item); - const char *parameter_string = PyUnicode_AsUTF8(item_as_string); - parameters.push_back(parameter_string); - Py_DECREF(item_as_string); - } - Py_DECREF(sequence); - - if (device_opencl_compile_kernel(parameters)) { - Py_RETURN_TRUE; - } - else { - Py_RETURN_FALSE; - } -} -#endif - static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepaths) { if (PyUnicode_Check(pyfilepaths)) { @@ -762,6 +737,10 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords) { +#if 1 + (void)args; + (void)keywords; +#else static const char *keyword_list[] = { "preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL}; PyObject *pypreferences, *pyscene, *pyviewlayer; @@ -835,7 +814,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key } /* Create denoiser. 
*/ - Denoiser denoiser(device); + DenoiserPipeline denoiser(device); denoiser.params = params; denoiser.input = input; denoiser.output = output; @@ -852,6 +831,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key PyErr_SetString(PyExc_ValueError, denoiser.error.c_str()); return NULL; } +#endif Py_RETURN_NONE; } @@ -903,10 +883,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args) RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr); BL::Scene b_scene(sceneptr); - if (debug_flags_sync_from_scene(b_scene)) { - VLOG(2) << "Tagging device list for update."; - Device::tag_update(); - } + debug_flags_sync_from_scene(b_scene); VLOG(2) << "Debug flags set to:\n" << DebugFlags(); @@ -917,10 +894,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args) static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/) { - if (debug_flags_reset()) { - VLOG(2) << "Tagging device list for update."; - Device::tag_update(); - } + debug_flags_reset(); if (debug_flags_set) { VLOG(2) << "Debug flags reset to:\n" << DebugFlags(); debug_flags_set = false; @@ -928,84 +902,6 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/ Py_RETURN_NONE; } -static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args) -{ - int num_resumable_chunks, current_resumable_chunk; - if (!PyArg_ParseTuple(args, "ii", &num_resumable_chunks, ¤t_resumable_chunk)) { - Py_RETURN_NONE; - } - - if (num_resumable_chunks <= 0) { - fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); - abort(); - Py_RETURN_NONE; - } - if (current_resumable_chunk < 1 || current_resumable_chunk > num_resumable_chunks) { - fprintf(stderr, "Cycles: Bad value for current resumable chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - - VLOG(1) << "Initialized resumable render: " - << "num_resumable_chunks=" << num_resumable_chunks << ", " - << 
"current_resumable_chunk=" << current_resumable_chunk; - BlenderSession::num_resumable_chunks = num_resumable_chunks; - BlenderSession::current_resumable_chunk = current_resumable_chunk; - - printf("Cycles: Will render chunk %d of %d\n", current_resumable_chunk, num_resumable_chunks); - - Py_RETURN_NONE; -} - -static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args) -{ - int num_chunks, start_chunk, end_chunk; - if (!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) { - Py_RETURN_NONE; - } - - if (num_chunks <= 0) { - fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); - abort(); - Py_RETURN_NONE; - } - if (start_chunk < 1 || start_chunk > num_chunks) { - fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - if (end_chunk < 1 || end_chunk > num_chunks) { - fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - if (start_chunk > end_chunk) { - fprintf(stderr, "Cycles: End chunk should be higher than start one.\n"); - abort(); - Py_RETURN_NONE; - } - - VLOG(1) << "Initialized resumable render: " - << "num_resumable_chunks=" << num_chunks << ", " - << "start_resumable_chunk=" << start_chunk << "end_resumable_chunk=" << end_chunk; - BlenderSession::num_resumable_chunks = num_chunks; - BlenderSession::start_resumable_chunk = start_chunk; - BlenderSession::end_resumable_chunk = end_chunk; - - printf("Cycles: Will render chunks %d to %d of %d\n", start_chunk, end_chunk, num_chunks); - - Py_RETURN_NONE; -} - -static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/) -{ - VLOG(1) << "Clear resumable render"; - BlenderSession::num_resumable_chunks = 0; - BlenderSession::current_resumable_chunk = 0; - - Py_RETURN_NONE; -} - static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/) { BlenderSession::print_render_stats = true; @@ -1015,16 +911,14 @@ static PyObject 
*enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false, has_opencl = false; + bool has_cuda = false, has_optix = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); - has_opencl |= (device_type == DEVICE_OPENCL); } - PyObject *list = PyTuple_New(3); + PyObject *list = PyTuple_New(2); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); - PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl)); return list; } @@ -1044,9 +938,6 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) if (override == "CPU") { BlenderSession::device_override = DEVICE_MASK_CPU; } - else if (override == "OPENCL") { - BlenderSession::device_override = DEVICE_MASK_OPENCL; - } else if (override == "CUDA") { BlenderSession::device_override = DEVICE_MASK_CUDA; } @@ -1072,8 +963,10 @@ static PyMethodDef methods[] = { {"create", create_func, METH_VARARGS, ""}, {"free", free_func, METH_O, ""}, {"render", render_func, METH_VARARGS, ""}, - {"bake", bake_func, METH_VARARGS, ""}, + {"render_frame_finish", render_frame_finish_func, METH_VARARGS, ""}, {"draw", draw_func, METH_VARARGS, ""}, + {"bake", bake_func, METH_VARARGS, ""}, + {"view_draw", view_draw_func, METH_VARARGS, ""}, {"sync", sync_func, METH_VARARGS, ""}, {"reset", reset_func, METH_VARARGS, ""}, #ifdef WITH_OSL @@ -1082,10 +975,6 @@ static PyMethodDef methods[] = { #endif {"available_devices", available_devices_func, METH_VARARGS, ""}, {"system_info", system_info_func, METH_NOARGS, ""}, -#ifdef WITH_OPENCL - {"opencl_disable", opencl_disable_func, METH_NOARGS, ""}, - {"opencl_compile", opencl_compile_func, METH_VARARGS, ""}, -#endif /* Standalone denoising */ {"denoise", 
(PyCFunction)denoise_func, METH_VARARGS | METH_KEYWORDS, ""}, @@ -1098,11 +987,6 @@ static PyMethodDef methods[] = { /* Statistics. */ {"enable_print_stats", enable_print_stats_func, METH_NOARGS, ""}, - /* Resumable render */ - {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""}, - {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""}, - {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""}, - /* Compute Device selection */ {"get_device_types", get_device_types_func, METH_VARARGS, ""}, {"set_device_override", set_device_override_func, METH_O, ""}, @@ -1153,14 +1037,6 @@ void *CCL_python_module_init() PyModule_AddStringConstant(mod, "osl_version_string", "unknown"); #endif -#ifdef WITH_NETWORK - PyModule_AddObject(mod, "with_network", Py_True); - Py_INCREF(Py_True); -#else /* WITH_NETWORK */ - PyModule_AddObject(mod, "with_network", Py_False); - Py_INCREF(Py_False); -#endif /* WITH_NETWORK */ - #ifdef WITH_EMBREE PyModule_AddObject(mod, "with_embree", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 29de886e4ff..5aafa605526 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -38,9 +38,11 @@ #include "util/util_hash.h" #include "util/util_logging.h" #include "util/util_murmurhash.h" +#include "util/util_path.h" #include "util/util_progress.h" #include "util/util_time.h" +#include "blender/blender_gpu_display.h" #include "blender/blender_session.h" #include "blender/blender_sync.h" #include "blender/blender_util.h" @@ -49,10 +51,6 @@ CCL_NAMESPACE_BEGIN DeviceTypeMask BlenderSession::device_override = DEVICE_MASK_ALL; bool BlenderSession::headless = false; -int BlenderSession::num_resumable_chunks = 0; -int BlenderSession::current_resumable_chunk = 0; -int BlenderSession::start_resumable_chunk = 0; -int BlenderSession::end_resumable_chunk = 0; bool 
BlenderSession::print_render_stats = false; BlenderSession::BlenderSession(BL::RenderEngine &b_engine, @@ -103,7 +101,9 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine, width(width), height(height), preview_osl(false), - python_thread_state(NULL) + python_thread_state(NULL), + use_developer_ui(b_userpref.experimental().use_cycles_debug() && + b_userpref.view().show_developer_ui()) { /* 3d view render */ background = false; @@ -119,10 +119,10 @@ BlenderSession::~BlenderSession() void BlenderSession::create_session() { - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); /* reset status/progress */ last_status = ""; @@ -131,20 +131,18 @@ void BlenderSession::create_session() start_resize_time = 0.0; /* create session */ - session = new Session(session_params); - session->scene = scene; + session = new Session(session_params, scene_params); session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this)); session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this)); session->set_pause(session_pause); /* create scene */ - scene = new Scene(scene_params, session->device); + scene = session->scene; scene->name = b_scene.name(); - session->scene = scene; - /* create sync */ - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); + sync = new BlenderSync( + b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress); BL::Object b_camera_override(b_engine.camera_override()); if (b_v3d) { 
sync->sync_view(b_v3d, b_rv3d, width, height); @@ -154,13 +152,23 @@ void BlenderSession::create_session() } /* set buffer parameters */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - session->reset(buffer_params, session_params.samples); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); + session->reset(session_params, buffer_params); - b_engine.use_highlight_tiles(session_params.progressive_refine == false); + /* Create GPU display. */ + if (!b_engine.is_preview() && !headless) { + session->set_gpu_display(make_unique<BlenderGPUDisplay>(b_engine, b_scene)); + } - update_resumable_tile_manager(session_params.samples); + /* Viewport and preview (as in, material preview) does not do tiled rendering, so can inform + * engine that no tracking of the tiles state is needed. + * The offline rendering will make a decision when tile is being written. The penalty of asking + * the engine to keep track of tiles state is minimal, so there is nothing to worry about here + * about possible single-tiled final render. 
*/ + if (!b_engine.is_preview() && !b_v3d) { + b_engine.use_highlight_tiles(true); + } } void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph) @@ -202,9 +210,9 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg return; } - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); if (scene->params.modified(scene_params) || session->params.modified(session_params) || !this->b_render.use_persistent_data()) { @@ -220,8 +228,6 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg session->progress.reset(); - session->tile_manager.set_tile_order(session_params.tile_order); - /* peak memory usage should show current render peak, not peak for all renders * made by this render session */ @@ -230,7 +236,8 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg if (is_new_session) { /* Sync object should be re-created for new scene. */ delete sync; - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); + sync = new BlenderSync( + b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress); } else { /* Sync recalculations to do just the required updates. 
*/ @@ -242,103 +249,85 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL); BL::RegionView3D b_null_region_view3d(PointerRNA_NULL); - BufferParams buffer_params = BlenderSync::get_buffer_params(b_null_space_view3d, - b_null_region_view3d, - scene->camera, - width, - height, - session_params.denoising.use); - session->reset(buffer_params, session_params.samples); - - b_engine.use_highlight_tiles(session_params.progressive_refine == false); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_null_space_view3d, b_null_region_view3d, scene->camera, width, height); + session->reset(session_params, buffer_params); /* reset time */ start_resize_time = 0.0; + + { + thread_scoped_lock lock(draw_state_.mutex); + draw_state_.last_pass_index = -1; + } } void BlenderSession::free_session() { - session->cancel(); + if (session) { + session->cancel(true); + } delete sync; + sync = nullptr; + delete session; + session = nullptr; } -static ShaderEvalType get_shader_type(const string &pass_type) +void BlenderSession::read_render_tile() { - const char *shader_type = pass_type.c_str(); + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); - /* data passes */ - if (strcmp(shader_type, "NORMAL") == 0) - return SHADER_EVAL_NORMAL; - else if (strcmp(shader_type, "UV") == 0) - return SHADER_EVAL_UV; - else if (strcmp(shader_type, "ROUGHNESS") == 0) - return SHADER_EVAL_ROUGHNESS; - else if (strcmp(shader_type, "DIFFUSE_COLOR") == 0) - return SHADER_EVAL_DIFFUSE_COLOR; - else if (strcmp(shader_type, "GLOSSY_COLOR") == 0) - return SHADER_EVAL_GLOSSY_COLOR; - else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0) - return SHADER_EVAL_TRANSMISSION_COLOR; - else if (strcmp(shader_type, "EMIT") == 0) - return SHADER_EVAL_EMISSION; + /* get render result */ + BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, + 
tile_offset.y, + tile_size.x, + tile_size.y, + b_rlay_name.c_str(), + b_rview_name.c_str()); - /* light passes */ - else if (strcmp(shader_type, "AO") == 0) - return SHADER_EVAL_AO; - else if (strcmp(shader_type, "COMBINED") == 0) - return SHADER_EVAL_COMBINED; - else if (strcmp(shader_type, "SHADOW") == 0) - return SHADER_EVAL_SHADOW; - else if (strcmp(shader_type, "DIFFUSE") == 0) - return SHADER_EVAL_DIFFUSE; - else if (strcmp(shader_type, "GLOSSY") == 0) - return SHADER_EVAL_GLOSSY; - else if (strcmp(shader_type, "TRANSMISSION") == 0) - return SHADER_EVAL_TRANSMISSION; + /* can happen if the intersected rectangle gives 0 width or height */ + if (b_rr.ptr.data == NULL) { + return; + } - /* extra */ - else if (strcmp(shader_type, "ENVIRONMENT") == 0) - return SHADER_EVAL_ENVIRONMENT; + BL::RenderResult::layers_iterator b_single_rlay; + b_rr.layers.begin(b_single_rlay); - else - return SHADER_EVAL_BAKE; -} + /* layer will be missing if it was disabled in the UI */ + if (b_single_rlay == b_rr.layers.end()) + return; -static BL::RenderResult begin_render_result(BL::RenderEngine &b_engine, - int x, - int y, - int w, - int h, - const char *layername, - const char *viewname) -{ - return b_engine.begin_result(x, y, w, h, layername, viewname); -} + BL::RenderLayer b_rlay = *b_single_rlay; -static void end_render_result(BL::RenderEngine &b_engine, - BL::RenderResult &b_rr, - bool cancel, - bool highlight, - bool do_merge_results) -{ - b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results); + vector<float> pixels(tile_size.x * tile_size.y * 4); + + /* Copy each pass. + * TODO:copy only the required ones for better performance? 
*/ + for (BL::RenderPass &b_pass : b_rlay.passes) { + session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect()); + } } -void BlenderSession::do_write_update_render_tile(RenderTile &rtile, - bool do_update_only, - bool do_read_only, - bool highlight) +void BlenderSession::write_render_tile() { - int x = rtile.x - session->tile_manager.params.full_x; - int y = rtile.y - session->tile_manager.params.full_y; - int w = rtile.w; - int h = rtile.h; + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); + + const string_view render_layer_name = session->get_render_tile_layer(); + const string_view render_view_name = session->get_render_tile_view(); + + b_engine.tile_highlight_clear_all(); /* get render result */ - BL::RenderResult b_rr = begin_render_result( - b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str()); + BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, + tile_offset.y, + tile_size.x, + tile_size.y, + render_layer_name.c_str(), + render_view_name.c_str()); /* can happen if the intersected rectangle gives 0 width or height */ if (b_rr.ptr.data == NULL) { @@ -349,64 +338,34 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile, b_rr.layers.begin(b_single_rlay); /* layer will be missing if it was disabled in the UI */ - if (b_single_rlay == b_rr.layers.end()) + if (b_single_rlay == b_rr.layers.end()) { return; + } BL::RenderLayer b_rlay = *b_single_rlay; - if (do_read_only) { - /* copy each pass */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - /* find matching pass type */ - PassType pass_type = BlenderSync::get_pass_type(b_pass); - int components = b_pass.channels(); - - rtile.buffers->set_pass_rect( - pass_type, components, (float *)b_pass.rect(), rtile.num_samples); - } - - end_render_result(b_engine, b_rr, false, false, false); - } - else if (do_update_only) { - /* Sample would be zero at initial tile update, which is only 
needed - * to tag tile form blender side as IN PROGRESS for proper highlight - * no buffers should be sent to blender yet. For denoise we also - * keep showing the noisy buffers until denoise is done. */ - bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE); + write_render_result(b_rlay); - if (merge) { - update_render_result(b_rlay, rtile); - } - - end_render_result(b_engine, b_rr, true, highlight, merge); - } - else { - /* Write final render result. */ - write_render_result(b_rlay, rtile); - end_render_result(b_engine, b_rr, false, false, true); - } + b_engine.end_result(b_rr, true, false, true); } -void BlenderSession::read_render_tile(RenderTile &rtile) +void BlenderSession::update_render_tile() { - do_write_update_render_tile(rtile, false, true, false); -} + if (!session->has_multiple_render_tiles()) { + /* Don't highlight full-frame tile. */ + return; + } -void BlenderSession::write_render_tile(RenderTile &rtile) -{ - do_write_update_render_tile(rtile, false, false, false); + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); + + b_engine.tile_highlight_clear_all(); + b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true); } -void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight) +void BlenderSession::full_buffer_written(string_view filename) { - /* use final write for preview renders, otherwise render result wouldn't be - * be updated in blender side - * would need to be investigated a bit further, but for now shall be fine - */ - if (!b_engine.is_preview()) - do_write_update_render_tile(rtile, true, false, highlight); - else - do_write_update_render_tile(rtile, false, false, false); + full_buffer_files_.emplace_back(filename); } static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest) @@ -430,12 +389,15 @@ void BlenderSession::stamp_view_layer_metadata(Scene *scene, const string &view_ 
to_string(session->params.samples).c_str()); /* Store ranged samples information. */ + /* TODO(sergey): Need to bring this information back. */ +#if 0 if (session->tile_manager.range_num_samples != -1) { b_rr.stamp_data_add_field((prefix + "range_start_sample").c_str(), to_string(session->tile_manager.range_start_sample).c_str()); b_rr.stamp_data_add_field((prefix + "range_num_samples").c_str(), to_string(session->tile_manager.range_num_samples).c_str()); } +#endif /* Write cryptomatte metadata. */ if (scene->film->get_cryptomatte_passes() & CRYPT_OBJECT) { @@ -475,38 +437,44 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) } /* set callback to write out render results */ - session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind( - &BlenderSession::update_render_tile, this, _1, _2); + session->write_render_tile_cb = [&]() { write_render_tile(); }; + + /* Use final write for preview renders, otherwise render result wouldn't be be updated on Blender + * side. */ + /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. 
*/ + if (b_engine.is_preview()) { + session->update_render_tile_cb = [&]() { write_render_tile(); }; + } + else { + session->update_render_tile_cb = [&]() { update_render_tile(); }; + } + + session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); }; BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); /* get buffer parameters */ - SessionParams session_params = BlenderSync::get_session_params( - b_engine, b_userpref, b_scene, background, b_view_layer); + const SessionParams session_params = BlenderSync::get_session_params( + b_engine, b_userpref, b_scene, background); BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); + b_v3d, b_rv3d, scene->camera, width, height); /* temporary render result to find needed passes and views */ - BL::RenderResult b_rr = begin_render_result( - b_engine, 0, 0, 1, 1, b_view_layer.name().c_str(), NULL); + BL::RenderResult b_rr = b_engine.begin_result(0, 0, 1, 1, b_view_layer.name().c_str(), NULL); BL::RenderResult::layers_iterator b_single_rlay; b_rr.layers.begin(b_single_rlay); BL::RenderLayer b_rlay = *b_single_rlay; - b_rlay_name = b_view_layer.name(); - /* Update denoising parameters. */ - session->set_denoising(session_params.denoising); + { + thread_scoped_lock lock(draw_state_.mutex); + b_rlay_name = b_view_layer.name(); - /* Compute render passes and film settings. */ - vector<Pass> passes = sync->sync_render_passes( - b_scene, b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising); + /* Signal that the display pass is to be updated. */ + draw_state_.last_pass_index = -1; + } - /* Set buffer params, using film settings from sync_render_passes. 
*/ - buffer_params.passes = passes; - buffer_params.denoising_data_pass = scene->film->get_denoising_data_pass(); - buffer_params.denoising_clean_pass = scene->film->get_denoising_clean_pass(); - buffer_params.denoising_prefiltered_pass = scene->film->get_denoising_prefiltered_pass(); + /* Compute render passes and film settings. */ + sync->sync_render_passes(b_rlay, b_view_layer); BL::RenderResult::views_iterator b_view_iter; @@ -520,6 +488,9 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) ++b_view_iter, ++view_index) { b_rview_name = b_view_iter->name(); + buffer_params.layer = b_view_layer.name(); + buffer_params.view = b_rview_name; + /* set the current view */ b_engine.active_view_set(b_rview_name.c_str()); @@ -549,20 +520,16 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) } /* Update number of samples per layer. */ - int samples = sync->get_layer_samples(); - bool bound_samples = sync->get_layer_bound_samples(); - int effective_layer_samples; + const int samples = sync->get_layer_samples(); + const bool bound_samples = sync->get_layer_bound_samples(); - if (samples != 0 && (!bound_samples || (samples < session_params.samples))) - effective_layer_samples = samples; - else - effective_layer_samples = session_params.samples; - - /* Update tile manager if we're doing resumable render. */ - update_resumable_tile_manager(effective_layer_samples); + SessionParams effective_session_params = session_params; + if (samples != 0 && (!bound_samples || (samples < session_params.samples))) { + effective_session_params.samples = samples; + } /* Update session itself. 
*/ - session->reset(buffer_params, effective_layer_samples); + session->reset(effective_session_params, buffer_params); /* render */ if (!b_engine.is_preview() && background && print_render_stats) { @@ -586,65 +553,146 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) stamp_view_layer_metadata(scene, b_rlay_name); /* free result without merging */ - end_render_result(b_engine, b_rr, true, true, false); + b_engine.end_result(b_rr, true, false, false); double total_time, render_time; session->progress.get_time(total_time, render_time); VLOG(1) << "Total render time: " << total_time; VLOG(1) << "Render time (without synchronization): " << render_time; +} + +void BlenderSession::render_frame_finish() +{ + /* Processing of all layers and views is done. Clear the strings so that we can communicate + * progress about reading files and denoising them. */ + b_rlay_name = ""; + b_rview_name = ""; + + if (!b_render.use_persistent_data()) { + /* Free the sync object so that it can properly dereference nodes from the scene graph before + * the graph is freed. 
*/ + delete sync; + sync = nullptr; + + session->device_free(); + } + + for (string_view filename : full_buffer_files_) { + session->process_full_buffer_from_disk(filename); + path_remove(filename); + } /* clear callback */ session->write_render_tile_cb = function_null; session->update_render_tile_cb = function_null; + session->full_buffer_written_cb = function_null; } -static int bake_pass_filter_get(const int pass_filter) +static PassType bake_type_to_pass(const string &bake_type_str, const int bake_filter) { - int flag = BAKE_FILTER_NONE; - - if ((pass_filter & BL::BakeSettings::pass_filter_DIRECT) != 0) - flag |= BAKE_FILTER_DIRECT; - if ((pass_filter & BL::BakeSettings::pass_filter_INDIRECT) != 0) - flag |= BAKE_FILTER_INDIRECT; - if ((pass_filter & BL::BakeSettings::pass_filter_COLOR) != 0) - flag |= BAKE_FILTER_COLOR; - - if ((pass_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0) - flag |= BAKE_FILTER_DIFFUSE; - if ((pass_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0) - flag |= BAKE_FILTER_GLOSSY; - if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0) - flag |= BAKE_FILTER_TRANSMISSION; - - if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0) - flag |= BAKE_FILTER_EMISSION; - if ((pass_filter & BL::BakeSettings::pass_filter_AO) != 0) - flag |= BAKE_FILTER_AO; - - return flag; + const char *bake_type = bake_type_str.c_str(); + + /* data passes */ + if (strcmp(bake_type, "POSITION") == 0) { + return PASS_POSITION; + } + else if (strcmp(bake_type, "NORMAL") == 0) { + return PASS_NORMAL; + } + else if (strcmp(bake_type, "UV") == 0) { + return PASS_UV; + } + else if (strcmp(bake_type, "ROUGHNESS") == 0) { + return PASS_ROUGHNESS; + } + else if (strcmp(bake_type, "EMIT") == 0) { + return PASS_EMISSION; + } + /* light passes */ + else if (strcmp(bake_type, "AO") == 0) { + return PASS_AO; + } + else if (strcmp(bake_type, "COMBINED") == 0) { + return PASS_COMBINED; + } + else if (strcmp(bake_type, "SHADOW") == 0) { + return 
PASS_SHADOW; + } + else if (strcmp(bake_type, "DIFFUSE") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_DIFFUSE; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_DIFFUSE_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_DIFFUSE_INDIRECT; + } + else { + return PASS_DIFFUSE_COLOR; + } + } + else if (strcmp(bake_type, "GLOSSY") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_GLOSSY; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_GLOSSY_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_GLOSSY_INDIRECT; + } + else { + return PASS_GLOSSY_COLOR; + } + } + else if (strcmp(bake_type, "TRANSMISSION") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_TRANSMISSION; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_TRANSMISSION_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_TRANSMISSION_INDIRECT; + } + else { + return PASS_TRANSMISSION_COLOR; + } + } + /* extra */ + else if (strcmp(bake_type, "ENVIRONMENT") == 0) { + return PASS_BACKGROUND; + } + + return PASS_COMBINED; } void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, BL::Object &b_object, - const string &pass_type, - const int pass_filter, + const string &bake_type, + const int bake_filter, const int bake_width, const int bake_height) { b_depsgraph = b_depsgraph_; - ShaderEvalType shader_type = get_shader_type(pass_type); - int bake_pass_filter = bake_pass_filter_get(pass_filter); - /* Initialize bake manager, before we load the baking kernels. 
*/ - scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter); + scene->bake_manager->set(scene, b_object.name()); - /* Passes are identified by name, so in order to return the combined pass we need to set the - * name. */ - Pass::add(PASS_COMBINED, scene->passes, "Combined"); + /* Add render pass that we want to bake, and name it Combined so that it is + * used as that on the Blender side. */ + Pass *pass = scene->create_node<Pass>(); + pass->set_name(ustring("Combined")); + pass->set_type(bake_type_to_pass(bake_type, bake_filter)); + pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR)); - session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1); - session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); + session->read_render_tile_cb = [&]() { read_render_tile(); }; + session->write_render_tile_cb = [&]() { write_render_tile(); }; + session->set_gpu_display(nullptr); if (!session->progress.get_cancel()) { /* Sync scene. */ @@ -667,18 +715,15 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, if (object_found && !session->progress.get_cancel()) { /* Get session and buffer parameters. */ - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - session_params.progressive_refine = false; BufferParams buffer_params; buffer_params.width = bake_width; buffer_params.height = bake_height; - buffer_params.passes = scene->passes; /* Update session. 
*/ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); session->progress.set_update_callback( function_bind(&BlenderSession::update_bake_progress, this)); @@ -690,71 +735,43 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, session->wait(); } - session->read_bake_tile_cb = function_null; + session->read_render_tile_cb = function_null; session->write_render_tile_cb = function_null; } -void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay, - RenderTile &rtile, - bool do_update_only) +void BlenderSession::write_render_result(BL::RenderLayer &b_rlay) { - RenderBuffers *buffers = rtile.buffers; - - /* copy data from device */ - if (!buffers->copy_from_device()) + if (!session->copy_render_tile_from_device()) { return; - - float exposure = scene->film->get_exposure(); - - vector<float> pixels(rtile.w * rtile.h * 4); - - /* Adjust absolute sample number to the range. */ - int sample = rtile.sample; - const int range_start_sample = session->tile_manager.range_start_sample; - if (range_start_sample != -1) { - sample -= range_start_sample; } - if (!do_update_only) { - /* copy each pass */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - int components = b_pass.channels(); - - /* Copy pixels from regular render passes. */ - bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]); - - /* If denoising pass, */ - if (!read) { - int denoising_offset = BlenderSync::get_denoising_pass(b_pass); - if (denoising_offset >= 0) { - read = buffers->get_denoising_pass_rect( - denoising_offset, exposure, sample, components, &pixels[0]); - } - } + const int2 tile_size = session->get_render_tile_size(); + vector<float> pixels(tile_size.x * tile_size.y * 4); - if (!read) { - memset(&pixels[0], 0, pixels.size() * sizeof(float)); - } - - b_pass.rect(&pixels[0]); + /* Copy each pass. 
*/ + for (BL::RenderPass &b_pass : b_rlay.passes) { + if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) { + memset(&pixels[0], 0, pixels.size() * sizeof(float)); } - } - else { - /* copy combined pass */ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); - if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0])) - b_combined_pass.rect(&pixels[0]); + + b_pass.rect(&pixels[0]); } } -void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile) +void BlenderSession::update_render_result(BL::RenderLayer &b_rlay) { - do_write_update_render_result(b_rlay, rtile, false); -} + if (!session->copy_render_tile_from_device()) { + return; + } -void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile) -{ - do_write_update_render_result(b_rlay, rtile, true); + const int2 tile_size = session->get_render_tile_size(); + vector<float> pixels(tile_size.x * tile_size.y * 4); + + /* Copy combined pass. 
*/ + BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); + if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) { + b_combined_pass.rect(&pixels[0]); + } } void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) @@ -764,19 +781,19 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) return; /* on session/scene parameter changes, we recreate session entirely */ - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); if (session->params.modified(session_params) || scene->params.modified(scene_params)) { free_session(); create_session(); } - /* increase samples, but never decrease */ + /* increase samples and render time, but never decrease */ session->set_samples(session_params.samples); - session->set_denoising_start_sample(session_params.denoising.start_sample); + session->set_time_limit(session_params.time_limit); session->set_pause(session_pause); /* copy recalc flags, outside of mutex so we can decide to do the real @@ -808,21 +825,12 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) sync->sync_camera(b_render, b_camera_override, width, height, ""); /* get buffer parameters */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - - if (!buffer_params.denoising_data_pass) { - session_params.denoising.use = false; - } - - session->set_denoising(session_params.denoising); - - /* Update film if denoising data was enabled or 
disabled. */ - scene->film->set_denoising_data_pass(buffer_params.denoising_data_pass); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); /* reset if needed */ if (scene->need_reset()) { - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); /* After session reset, so device is not accessing image data anymore. */ builtin_images_load(); @@ -839,7 +847,41 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) session->start(); } -bool BlenderSession::draw(int w, int h) +void BlenderSession::draw(BL::SpaceImageEditor &space_image) +{ + if (!session || !session->scene) { + /* Offline render drawing does not force the render engine update, which means it's possible + * that the Session is not created yet. */ + return; + } + + thread_scoped_lock lock(draw_state_.mutex); + + const int pass_index = space_image.image_user().multilayer_pass(); + if (pass_index != draw_state_.last_pass_index) { + BL::RenderPass b_display_pass(b_engine.pass_by_index_get(b_rlay_name.c_str(), pass_index)); + if (!b_display_pass) { + return; + } + + Scene *scene = session->scene; + + thread_scoped_lock lock(scene->mutex); + + const Pass *pass = Pass::find(scene->passes, b_display_pass.name()); + if (!pass) { + return; + } + + scene->film->set_display_pass(pass->get_type()); + + draw_state_.last_pass_index = pass_index; + } + + session->draw(); +} + +void BlenderSession::view_draw(int w, int h) { /* pause in redraw in case update is not being called due to final render */ session->set_pause(BlenderSync::get_session_pause(b_scene, background)); @@ -885,14 +927,14 @@ bool BlenderSession::draw(int w, int h) /* reset if requested */ if (reset) { - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - BufferParams buffer_params = 
BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); if (session_pause == false) { - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); start_resize_time = 0.0; } } @@ -905,18 +947,7 @@ bool BlenderSession::draw(int w, int h) update_status_progress(); /* draw */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use); - DeviceDrawParams draw_params; - - if (session->params.display_buffer_linear) { - draw_params.bind_display_space_shader_cb = function_bind( - &BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene); - draw_params.unbind_display_space_shader_cb = function_bind( - &BL::RenderEngine::unbind_display_space_shader, &b_engine); - } - - return !session->draw(buffer_params, draw_params); + session->draw(); } void BlenderSession::get_status(string &status, string &substatus) @@ -924,11 +955,6 @@ void BlenderSession::get_status(string &status, string &substatus) session->progress.get_status(status, substatus); } -void BlenderSession::get_kernel_status(string &kernel_status) -{ - session->progress.get_kernel_status(kernel_status); -} - void BlenderSession::get_progress(float &progress, double &total_time, double &render_time) { session->progress.get_time(total_time, render_time); @@ -947,7 +973,7 @@ void BlenderSession::update_bake_progress() void BlenderSession::update_status_progress() { - string timestatus, status, substatus, kernel_status; + string timestatus, status, substatus; string scene_status = ""; float progress; double total_time, remaining_time = 0, render_time; @@ -955,7 +981,6 
@@ void BlenderSession::update_status_progress() float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f; get_status(status, substatus); - get_kernel_status(kernel_status); get_progress(progress, total_time, render_time); if (progress > 0) @@ -980,14 +1005,12 @@ void BlenderSession::update_status_progress() status = " | " + status; if (substatus.size() > 0) status += " | " + substatus; - if (kernel_status.size() > 0) - status += " | " + kernel_status; } double current_time = time_dt(); - /* When rendering in a window, redraw the status at least once per second to keep the elapsed and - * remaining time up-to-date. For headless rendering, only report when something significant - * changes to keep the console output readable. */ + /* When rendering in a window, redraw the status at least once per second to keep the elapsed + * and remaining time up-to-date. For headless rendering, only report when something + * significant changes to keep the console output readable. */ if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) { b_engine.update_stats("", (timestatus + scene_status + status).c_str()); b_engine.update_memory_stats(mem_used, mem_peak); @@ -1048,56 +1071,6 @@ void BlenderSession::test_cancel() session->progress.set_cancel("Cancelled"); } -void BlenderSession::update_resumable_tile_manager(int num_samples) -{ - const int num_resumable_chunks = BlenderSession::num_resumable_chunks, - current_resumable_chunk = BlenderSession::current_resumable_chunk; - if (num_resumable_chunks == 0) { - return; - } - - if (num_resumable_chunks > num_samples) { - fprintf(stderr, - "Cycles warning: more sample chunks (%d) than samples (%d), " - "this will cause some samples to be included in multiple chunks.\n", - num_resumable_chunks, - num_samples); - } - - const float num_samples_per_chunk = (float)num_samples / num_resumable_chunks; - - float range_start_sample, range_num_samples; - if (current_resumable_chunk != 0) { - /* Single 
chunk rendering. */ - range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); - range_num_samples = num_samples_per_chunk; - } - else { - /* Ranged-chunks. */ - const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1; - range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1); - range_num_samples = num_chunks * num_samples_per_chunk; - } - - /* Round after doing the multiplications with num_chunks and num_samples_per_chunk - * to allow for many small chunks. */ - int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f); - int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1); - - /* Make sure we don't overshoot. */ - if (rounded_range_start_sample + rounded_range_num_samples > num_samples) { - rounded_range_num_samples = num_samples - rounded_range_num_samples; - } - - VLOG(1) << "Samples range start is " << range_start_sample << ", " - << "number of samples to render is " << range_num_samples; - - scene->integrator->set_start_sample(rounded_range_start_sample); - - session->tile_manager.range_start_sample = rounded_range_start_sample; - session->tile_manager.range_num_samples = rounded_range_num_samples; -} - void BlenderSession::free_blender_memory_if_possible() { if (!background) { diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index d967b81c854..cf52359ea5d 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -33,8 +33,6 @@ class BlenderSync; class ImageMetaData; class Scene; class Session; -class RenderBuffers; -class RenderTile; class BlenderSession { public: @@ -62,6 +60,8 @@ class BlenderSession { /* offline render */ void render(BL::Depsgraph &b_depsgraph); + void render_frame_finish(); + void bake(BL::Depsgraph &b_depsgrah, BL::Object &b_object, const string &pass_type, @@ -69,24 +69,29 @@ class BlenderSession { const int bake_width, const int bake_height); - void 
write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile); - void write_render_tile(RenderTile &rtile); - void read_render_tile(RenderTile &rtile); + void write_render_result(BL::RenderLayer &b_rlay); + void write_render_tile(); + + void update_render_tile(); + + void full_buffer_written(string_view filename); /* update functions are used to update display buffer only after sample was rendered * only needed for better visual feedback */ - void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile); - void update_render_tile(RenderTile &rtile, bool highlight); + void update_render_result(BL::RenderLayer &b_rlay); + + /* read functions for baking input */ + void read_render_tile(); /* interactive updates */ void synchronize(BL::Depsgraph &b_depsgraph); /* drawing */ - bool draw(int w, int h); + void draw(BL::SpaceImageEditor &space_image); + void view_draw(int w, int h); void tag_redraw(); void tag_update(); void get_status(string &status, string &substatus); - void get_kernel_status(string &kernel_status); void get_progress(float &progress, double &total_time, double &render_time); void test_cancel(); void update_status_progress(); @@ -123,6 +128,8 @@ class BlenderSession { void *python_thread_state; + bool use_developer_ui; + /* Global state which is common for all render sessions created from Blender. * Usually denotes command line arguments. */ @@ -134,41 +141,25 @@ class BlenderSession { */ static bool headless; - /* ** Resumable render ** */ - - /* Overall number of chunks in which the sample range is to be divided. */ - static int num_resumable_chunks; - - /* Current resumable chunk index to render. */ - static int current_resumable_chunk; - - /* Alternative to single-chunk rendering to render a range of chunks. 
*/ - static int start_resumable_chunk; - static int end_resumable_chunk; - static bool print_render_stats; protected: void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name); - void do_write_update_render_result(BL::RenderLayer &b_rlay, - RenderTile &rtile, - bool do_update_only); - void do_write_update_render_tile(RenderTile &rtile, - bool do_update_only, - bool do_read_only, - bool highlight); - void builtin_images_load(); - /* Update tile manager to reflect resumable render settings. */ - void update_resumable_tile_manager(int num_samples); - /* Is used after each render layer synchronization is done with the goal * of freeing render engine data which is held from Blender side (for * example, dependency graph). */ void free_blender_memory_if_possible(); + + struct { + thread_mutex mutex; + int last_pass_index = -1; + } draw_state_; + + vector<string> full_buffer_files_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index de7b2761d00..8c4f789ffd0 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -17,6 +17,7 @@ #include "render/background.h" #include "render/colorspace.h" #include "render/graph.h" +#include "render/integrator.h" #include "render/light.h" #include "render/nodes.h" #include "render/osl.h" @@ -475,17 +476,11 @@ static ShaderNode *add_node(Scene *scene, SubsurfaceScatteringNode *subsurface = graph->create_node<SubsurfaceScatteringNode>(); switch (b_subsurface_node.falloff()) { - case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC: - subsurface->set_falloff(CLOSURE_BSSRDF_CUBIC_ID); - break; - case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN: - subsurface->set_falloff(CLOSURE_BSSRDF_GAUSSIAN_ID); - break; - case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY: - subsurface->set_falloff(CLOSURE_BSSRDF_BURLEY_ID); + case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK_FIXED_RADIUS: + 
subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); break; case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK: - subsurface->set_falloff(CLOSURE_BSSRDF_RANDOM_WALK_ID); + subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_ID); break; } @@ -597,11 +592,11 @@ static ShaderNode *add_node(Scene *scene, break; } switch (b_principled_node.subsurface_method()) { - case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY: - principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_ID); + case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK_FIXED_RADIUS: + principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); break; case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK: - principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_ID); break; } node = principled; @@ -1360,10 +1355,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all) void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all) { Background *background = scene->background; + Integrator *integrator = scene->integrator; BL::World b_world = b_scene.world(); - BlenderViewportParameters new_viewport_parameters(b_v3d); + BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); if (world_recalc || update_all || b_world.ptr.data != world_map || viewport_parameters.shader_modified(new_viewport_parameters)) { @@ -1455,9 +1451,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, /* AO */ BL::WorldLighting b_light = b_world.light_settings(); - background->set_use_ao(b_light.use_ambient_occlusion()); - background->set_ao_factor(b_light.ao_factor()); - background->set_ao_distance(b_light.distance()); + integrator->set_ao_factor(b_light.ao_factor()); + integrator->set_ao_distance(b_light.distance()); /* visibility */ PointerRNA cvisibility = 
RNA_pointer_get(&b_world.ptr, "cycles_visibility"); @@ -1472,9 +1467,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, background->set_visibility(visibility); } else { - background->set_use_ao(false); - background->set_ao_factor(0.0f); - background->set_ao_distance(FLT_MAX); + integrator->set_ao_factor(1.0f); + integrator->set_ao_distance(10.0f); } shader->set_graph(graph); @@ -1496,7 +1490,6 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, background->set_use_shader(view_layer.use_background_shader || viewport_parameters.use_custom_shader()); - background->set_use_ao(background->get_use_ao() && view_layer.use_background_ao); background->tag_update(scene); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 26d64b7bf85..d6fc7ee1723 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -53,6 +53,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine, BL::Scene &b_scene, Scene *scene, bool preview, + bool use_developer_ui, Progress &progress) : b_engine(b_engine), b_data(b_data), @@ -68,6 +69,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine, scene(scene), preview(preview), experimental(false), + use_developer_ui(use_developer_ui), dicing_rate(1.0f), max_subdivisions(12), progress(progress), @@ -224,7 +226,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d } if (b_v3d) { - BlenderViewportParameters new_viewport_parameters(b_v3d); + BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); if (viewport_parameters.shader_modified(new_viewport_parameters)) { world_recalc = true; @@ -251,9 +253,13 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render, BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); + /* TODO(sergey): This feels weak to pass view layer to the integrator, and even weaker to have an + * implicit check on whether it 
is a background render or not. What is the nicer thing here? */ + const bool background = !b_v3d; + sync_view_layer(b_view_layer); - sync_integrator(); - sync_film(b_v3d); + sync_integrator(b_view_layer, background); + sync_film(b_view_layer, b_v3d); sync_shaders(b_depsgraph, b_v3d); sync_images(); @@ -280,7 +286,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render, /* Integrator */ -void BlenderSync::sync_integrator() +void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background) { PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -328,59 +334,24 @@ void BlenderSync::sync_integrator() integrator->set_motion_blur(view_layer.use_motion_blur); } - integrator->set_method((Integrator::Method)get_enum( - cscene, "progressive", Integrator::NUM_METHODS, Integrator::PATH)); - - integrator->set_sample_all_lights_direct(get_boolean(cscene, "sample_all_lights_direct")); - integrator->set_sample_all_lights_indirect(get_boolean(cscene, "sample_all_lights_indirect")); integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold")); SamplingPattern sampling_pattern = (SamplingPattern)get_enum( cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_SOBOL); - - int adaptive_min_samples = INT_MAX; - - if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) { - sampling_pattern = SAMPLING_PATTERN_PMJ; - adaptive_min_samples = get_int(cscene, "adaptive_min_samples"); - integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold")); - } - else { - integrator->set_adaptive_threshold(0.0f); - } - integrator->set_sampling_pattern(sampling_pattern); - int diffuse_samples = get_int(cscene, "diffuse_samples"); - int glossy_samples = get_int(cscene, "glossy_samples"); - int transmission_samples = get_int(cscene, "transmission_samples"); - int ao_samples = get_int(cscene, "ao_samples"); - int mesh_light_samples = get_int(cscene, "mesh_light_samples"); - int subsurface_samples = get_int(cscene, 
"subsurface_samples"); - int volume_samples = get_int(cscene, "volume_samples"); - - if (get_boolean(cscene, "use_square_samples")) { - integrator->set_diffuse_samples(diffuse_samples * diffuse_samples); - integrator->set_glossy_samples(glossy_samples * glossy_samples); - integrator->set_transmission_samples(transmission_samples * transmission_samples); - integrator->set_ao_samples(ao_samples * ao_samples); - integrator->set_mesh_light_samples(mesh_light_samples * mesh_light_samples); - integrator->set_subsurface_samples(subsurface_samples * subsurface_samples); - integrator->set_volume_samples(volume_samples * volume_samples); - adaptive_min_samples = min(adaptive_min_samples * adaptive_min_samples, INT_MAX); + if (preview) { + integrator->set_use_adaptive_sampling( + RNA_boolean_get(&cscene, "use_preview_adaptive_sampling")); + integrator->set_adaptive_threshold(get_float(cscene, "preview_adaptive_threshold")); + integrator->set_adaptive_min_samples(get_int(cscene, "preview_adaptive_min_samples")); } else { - integrator->set_diffuse_samples(diffuse_samples); - integrator->set_glossy_samples(glossy_samples); - integrator->set_transmission_samples(transmission_samples); - integrator->set_ao_samples(ao_samples); - integrator->set_mesh_light_samples(mesh_light_samples); - integrator->set_subsurface_samples(subsurface_samples); - integrator->set_volume_samples(volume_samples); + integrator->set_use_adaptive_sampling(RNA_boolean_get(&cscene, "use_adaptive_sampling")); + integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold")); + integrator->set_adaptive_min_samples(get_int(cscene, "adaptive_min_samples")); } - integrator->set_adaptive_min_samples(adaptive_min_samples); - if (get_boolean(cscene, "use_fast_gi")) { if (preview) { integrator->set_ao_bounces(get_int(cscene, "ao_bounces")); @@ -393,20 +364,38 @@ void BlenderSync::sync_integrator() integrator->set_ao_bounces(0); } - /* UPDATE_NONE as we don't want to tag the integrator as modified, just tag 
dependent things */ + const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background); + integrator->set_use_denoise(denoise_params.use); + + /* Only update denoiser parameters if the denoiser is actually used. This allows to tweak + * denoiser parameters before enabling it without render resetting on every change. The downside + * is that the interface and the integrator are technically out of sync. */ + if (denoise_params.use) { + integrator->set_denoiser_type(denoise_params.type); + integrator->set_denoise_start_sample(denoise_params.start_sample); + integrator->set_use_denoise_pass_albedo(denoise_params.use_pass_albedo); + integrator->set_use_denoise_pass_normal(denoise_params.use_pass_normal); + integrator->set_denoiser_prefilter(denoise_params.prefilter); + } + + /* UPDATE_NONE as we don't want to tag the integrator as modified (this was done by the + * set calls above), but we need to make sure that the dependent things are tagged. */ integrator->tag_update(scene, Integrator::UPDATE_NONE); } /* Film */ -void BlenderSync::sync_film(BL::SpaceView3D &b_v3d) +void BlenderSync::sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d) { PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); Film *film = scene->film; if (b_v3d) { - film->set_display_pass(update_viewport_display_passes(b_v3d, scene->passes)); + const BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); + film->set_display_pass(new_viewport_parameters.display_pass); + film->set_show_active_pixels(new_viewport_parameters.show_active_pixels); } film->set_exposure(get_float(cscene, "film_exposure")); @@ -434,6 +423,15 @@ void BlenderSync::sync_film(BL::SpaceView3D &b_v3d) break; } } + + /* Blender viewport does not support proper shadow catcher compositing, so force an approximate + * mode to improve visual feedback. 
*/ + if (b_v3d) { + film->set_use_approximate_shadow_catcher(true); + } + else { + film->set_use_approximate_shadow_catcher(!get_boolean(crl, "use_pass_shadow_catcher")); + } } /* Render Layer */ @@ -444,7 +442,6 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer) /* Filter. */ view_layer.use_background_shader = b_view_layer.use_sky(); - view_layer.use_background_ao = b_view_layer.use_ao(); /* Always enable surfaces for baking, otherwise there is nothing to bake to. */ view_layer.use_surfaces = b_view_layer.use_solid() || scene->bake_manager->get_baking(); view_layer.use_hair = b_view_layer.use_strand(); @@ -464,10 +461,7 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer) if (use_layer_samples != 2) { int samples = b_view_layer.samples(); - if (get_boolean(cscene, "use_square_samples")) - view_layer.samples = samples * samples; - else - view_layer.samples = samples; + view_layer.samples = samples; } } @@ -499,7 +493,8 @@ void BlenderSync::sync_images() } /* Passes */ -PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) + +static PassType get_blender_pass_type(BL::RenderPass &b_pass) { string name = b_pass.name(); #define MAP_PASS(passname, passtype) \ @@ -507,10 +502,15 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) return passtype; \ } \ ((void)0) + /* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + MAP_PASS("Combined", PASS_COMBINED); + MAP_PASS("Noisy Image", PASS_COMBINED); + MAP_PASS("Depth", PASS_DEPTH); MAP_PASS("Mist", PASS_MIST); + MAP_PASS("Position", PASS_POSITION); MAP_PASS("Normal", PASS_NORMAL); MAP_PASS("IndexOB", PASS_OBJECT_ID); MAP_PASS("UV", PASS_UV); @@ -539,118 +539,92 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE); MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL); + MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL); + MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO); + + MAP_PASS("Shadow Catcher", 
PASS_SHADOW_CATCHER); + MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER); + MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); + if (string_startswith(name, cryptomatte_prefix)) { return PASS_CRYPTOMATTE; } + #undef MAP_PASS return PASS_NONE; } -int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) +static Pass *pass_add(Scene *scene, + PassType type, + const char *name, + PassMode mode = PassMode::DENOISED) { - string name = b_pass.name(); + Pass *pass = scene->create_node<Pass>(); - if (name == "Noisy Image") - return DENOISING_PASS_PREFILTERED_COLOR; + pass->set_type(type); + pass->set_name(ustring(name)); + pass->set_mode(mode); - if (name.substr(0, 10) != "Denoising ") { - return -1; - } - name = name.substr(10); - -#define MAP_PASS(passname, offset) \ - if (name == passname) { \ - return offset; \ - } \ - ((void)0) - MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL); - MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO); - MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH); - MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING); - MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE); - MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY); - MAP_PASS("Clean", DENOISING_PASS_CLEAN); -#undef MAP_PASS - - return -1; + return pass; } -vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, - BL::RenderLayer &b_rlay, - BL::ViewLayer &b_view_layer, - bool adaptive_sampling, - const DenoiseParams &denoising) +void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) { - vector<Pass> passes; + PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + + /* Delete all existing passes. */ + set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end()); + scene->delete_nodes(clear_passes); - /* loop over passes */ + /* Always add combined pass. 
*/ + pass_add(scene, PASS_COMBINED, "Combined"); + + /* Blender built-in data and light passes. */ for (BL::RenderPass &b_pass : b_rlay.passes) { - PassType pass_type = get_pass_type(b_pass); + const PassType pass_type = get_blender_pass_type(b_pass); + + if (pass_type == PASS_NONE) { + LOG(ERROR) << "Unknown pass " << b_pass.name(); + continue; + } if (pass_type == PASS_MOTION && (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) { continue; } - if (pass_type != PASS_NONE) - Pass::add(pass_type, passes, b_pass.name().c_str()); - } - - PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - int denoising_flags = 0; - if (denoising.use || denoising.store_passes) { - if (denoising.type == DENOISER_NLM) { -#define MAP_OPTION(name, flag) \ - if (!get_boolean(crl, name)) { \ - denoising_flags |= flag; \ - } \ - ((void)0) - MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); - MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); - MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR); - MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND); - MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR); - MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND); -#undef MAP_OPTION - } - b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); + pass_add(scene, pass_type, b_pass.name().c_str()); } - scene->film->set_denoising_flags(denoising_flags); - - if (denoising.store_passes) { - b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - if (denoising.type == DENOISER_NLM) { - b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); - 
b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); - } - if (scene->film->get_denoising_flags() & DENOISING_CLEAN_ALL_PASSES) { - b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str()); - } - } + PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); + /* Debug passes. */ if (get_boolean(crl, "pass_debug_render_time")) { b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); + pass_add(scene, PASS_RENDER_TIME, "Debug Render Time"); } if (get_boolean(crl, "pass_debug_sample_count")) { b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count"); + pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count"); } + + /* Cycles specific passes. */ if (get_boolean(crl, "use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); - Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); + pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir"); } if (get_boolean(crl, "use_pass_volume_indirect")) { b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str()); - Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd"); + pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd"); + } + if (get_boolean(crl, "use_pass_shadow_catcher")) { + b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher"); } /* Cryptomatte stores two ID/weight pairs per RGBA layer. 
@@ -662,7 +636,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Object%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT); } @@ -670,7 +644,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Material%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL); } @@ -678,22 +652,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Asset%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET); } - if (b_view_layer.use_pass_cryptomatte_accurate() && cryptomatte_passes != CRYPT_NONE) { - cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ACCURATE); - } scene->film->set_cryptomatte_passes(cryptomatte_passes); - if (adaptive_sampling) { - Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); - if (!get_boolean(crl, "pass_debug_sample_count")) { - Pass::add(PASS_SAMPLE_COUNT, passes); + /* Denoising passes. 
*/ + const bool use_denoising = get_boolean(cscene, "use_denoising") && + get_boolean(crl, "use_denoising"); + const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes"); + if (use_denoising) { + b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); + pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY); + if (get_boolean(crl, "use_pass_shadow_catcher")) { + b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY); } } + if (store_denoising_passes) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); + pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY); + + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY); + } + /* Custom AOV passes. */ BL::ViewLayer::aovs_iterator b_aov_iter; for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) { BL::AOV b_aov(*b_aov_iter); @@ -706,28 +691,15 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, if (is_color) { b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_AOV_COLOR, passes, name.c_str()); + pass_add(scene, PASS_AOV_COLOR, name.c_str()); } else { b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_AOV_VALUE, passes, name.c_str()); + pass_add(scene, PASS_AOV_VALUE, name.c_str()); } } - scene->film->set_denoising_data_pass(denoising.use || denoising.store_passes); - scene->film->set_denoising_clean_pass(scene->film->get_denoising_flags() & - DENOISING_CLEAN_ALL_PASSES); - scene->film->set_denoising_prefiltered_pass(denoising.store_passes && - denoising.type == DENOISER_NLM); scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold()); - - if (!Pass::equals(passes, 
scene->passes)) { - scene->film->tag_passes_update(scene, passes); - scene->film->tag_modified(); - scene->integrator->tag_update(scene, Integrator::UPDATE_ALL); - } - - return passes; } void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph) @@ -773,9 +745,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background) params.shadingsystem = SHADINGSYSTEM_OSL; if (background || DebugFlags().viewport_static_bvh) - params.bvh_type = SceneParams::BVH_STATIC; + params.bvh_type = BVH_TYPE_STATIC; else - params.bvh_type = SceneParams::BVH_DYNAMIC; + params.bvh_type = BVH_TYPE_DYNAMIC; params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits"); params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh"); @@ -818,8 +790,7 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background) SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, BL::Preferences &b_preferences, BL::Scene &b_scene, - bool background, - BL::ViewLayer b_view_layer) + bool background) { SessionParams params; PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -827,7 +798,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* feature set */ params.experimental = (get_enum(cscene, "feature_set") != 0); - /* Background */ + /* Headless and background rendering. 
*/ + params.headless = BlenderSession::headless; params.background = background; /* Device */ @@ -836,111 +808,26 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* samples */ int samples = get_int(cscene, "samples"); - int aa_samples = get_int(cscene, "aa_samples"); int preview_samples = get_int(cscene, "preview_samples"); - int preview_aa_samples = get_int(cscene, "preview_aa_samples"); - if (get_boolean(cscene, "use_square_samples")) { - aa_samples = aa_samples * aa_samples; - preview_aa_samples = preview_aa_samples * preview_aa_samples; - - samples = samples * samples; - preview_samples = preview_samples * preview_samples; - } - - if (get_enum(cscene, "progressive") == 0 && params.device.has_branched_path) { - if (background) { - params.samples = aa_samples; - } - else { - params.samples = preview_aa_samples; - if (params.samples == 0) - params.samples = INT_MAX; - } + if (background) { + params.samples = samples; } else { - if (background) { - params.samples = samples; - } - else { - params.samples = preview_samples; - if (params.samples == 0) - params.samples = INT_MAX; - } + params.samples = preview_samples; + if (params.samples == 0) + params.samples = INT_MAX; } /* Clamp samples. */ params.samples = min(params.samples, Integrator::MAX_SAMPLES); - /* Adaptive sampling. 
*/ - params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling"); - - /* tiles */ - const bool is_cpu = (params.device.type == DEVICE_CPU); - if (!is_cpu && !background) { - /* currently GPU could be much slower than CPU when using tiles, - * still need to be investigated, but meanwhile make it possible - * to work in viewport smoothly - */ - int debug_tile_size = get_int(cscene, "debug_tile_size"); - - params.tile_size = make_int2(debug_tile_size, debug_tile_size); - } - else { - int tile_x = b_engine.tile_x(); - int tile_y = b_engine.tile_y(); - - params.tile_size = make_int2(tile_x, tile_y); - } - - if ((BlenderSession::headless == false) && background) { - params.tile_order = (TileOrder)get_enum(cscene, "tile_order"); - } - else { - params.tile_order = TILE_BOTTOM_TO_TOP; - } - - /* Denoising */ - params.denoising = get_denoise_params(b_scene, b_view_layer, background); - - if (params.denoising.use) { - /* Add additional denoising devices if we are rendering and denoising - * with different devices. */ - params.device.add_denoising_devices(params.denoising.type); - - /* Check if denoiser is supported by device. 
*/ - if (!(params.device.denoisers & params.denoising.type)) { - params.denoising.use = false; - } - } - /* Viewport Performance */ - params.start_resolution = get_int(cscene, "preview_start_resolution"); params.pixel_size = b_engine.get_preview_pixel_size(b_scene); - /* other parameters */ - params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout"); - params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout"); - params.text_timeout = (double)get_float(cscene, "debug_text_timeout"); - - /* progressive refine */ - BL::RenderSettings b_r = b_scene.render(); - params.progressive_refine = b_engine.is_preview() || - get_boolean(cscene, "use_progressive_refine"); - if (b_r.use_save_buffers() || params.adaptive_sampling) - params.progressive_refine = false; - if (background) { - if (params.progressive_refine) - params.progressive = true; - else - params.progressive = false; - - params.start_resolution = INT_MAX; params.pixel_size = 1; } - else - params.progressive = true; /* shading system - scene level needs full refresh */ const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system"); @@ -950,19 +837,30 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, else if (shadingsystem == 1) params.shadingsystem = SHADINGSYSTEM_OSL; - /* Color management. */ - params.display_buffer_linear = b_engine.support_display_space_shader(b_scene); - - if (b_engine.is_preview()) { - /* For preview rendering we're using same timeout as - * blender's job update. - */ - params.progressive_update_timeout = 0.1; + /* Time limit. */ + if (background) { + params.time_limit = get_float(cscene, "time_limit"); + } + else { + /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is + * usually higher than acceptable level for the final frame. */ + /* TODO: It might be useful to support time limit in the viewport as well, but needs some + * extra thoughts and input. 
*/ + params.time_limit = 0.0; } + /* Profiling. */ params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background && BlenderSession::print_render_stats; + if (background) { + params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile"); + params.tile_size = get_int(cscene, "tile_size"); + } + else { + params.use_auto_tile = false; + } + return params; } @@ -970,33 +868,34 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, BL::ViewLayer &b_view_layer, bool background) { + enum DenoiserInput { + DENOISER_INPUT_RGB = 1, + DENOISER_INPUT_RGB_ALBEDO = 2, + DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, + + DENOISER_INPUT_NUM, + }; + DenoiseParams denoising; PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + int input_passes = -1; + if (background) { /* Final Render Denoising */ denoising.use = get_boolean(cscene, "use_denoising"); denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE); + denoising.prefilter = (DenoiserPrefilter)get_enum( + cscene, "denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_NONE); + + input_passes = (DenoiserInput)get_enum( + cscene, "denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO_NORMAL); if (b_view_layer) { PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles"); if (!get_boolean(clayer, "use_denoising")) { denoising.use = false; } - - denoising.radius = get_int(clayer, "denoising_radius"); - denoising.strength = get_float(clayer, "denoising_strength"); - denoising.feature_strength = get_float(clayer, "denoising_feature_strength"); - denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca"); - - denoising.input_passes = (DenoiserInput)get_enum( - clayer, - (denoising.type == DENOISER_OPTIX) ? 
"denoising_optix_input_passes" : - "denoising_openimagedenoise_input_passes", - DENOISER_INPUT_NUM, - DENOISER_INPUT_RGB_ALBEDO_NORMAL); - - denoising.store_passes = get_boolean(clayer, "denoising_store_passes"); } } else { @@ -1004,10 +903,12 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, denoising.use = get_boolean(cscene, "use_preview_denoising"); denoising.type = (DenoiserType)get_enum( cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE); + denoising.prefilter = (DenoiserPrefilter)get_enum( + cscene, "preview_denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_FAST); denoising.start_sample = get_int(cscene, "preview_denoising_start_sample"); - denoising.input_passes = (DenoiserInput)get_enum( - cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, (int)denoising.input_passes); + input_passes = (DenoiserInput)get_enum( + cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO); /* Auto select fastest denoiser. 
*/ if (denoising.type == DENOISER_NONE) { @@ -1023,6 +924,27 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, } } + switch (input_passes) { + case DENOISER_INPUT_RGB: + denoising.use_pass_albedo = false; + denoising.use_pass_normal = false; + break; + + case DENOISER_INPUT_RGB_ALBEDO: + denoising.use_pass_albedo = true; + denoising.use_pass_normal = false; + break; + + case DENOISER_INPUT_RGB_ALBEDO_NORMAL: + denoising.use_pass_albedo = true; + denoising.use_pass_normal = true; + break; + + default: + LOG(ERROR) << "Unhandled input passes enum " << input_passes; + break; + } + return denoising; } diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index d25c0ce1bc3..786479ac0f8 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -60,6 +60,7 @@ class BlenderSync { BL::Scene &b_scene, Scene *scene, bool preview, + bool use_developer_ui, Progress &progress); ~BlenderSync(); @@ -75,12 +76,8 @@ class BlenderSync { int height, void **python_thread_state); void sync_view_layer(BL::ViewLayer &b_view_layer); - vector<Pass> sync_render_passes(BL::Scene &b_scene, - BL::RenderLayer &b_render_layer, - BL::ViewLayer &b_view_layer, - bool adaptive_sampling, - const DenoiseParams &denoising); - void sync_integrator(); + void sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer); + void sync_integrator(BL::ViewLayer &b_view_layer, bool background); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, int width, @@ -98,22 +95,13 @@ class BlenderSync { /* get parameters */ static SceneParams get_scene_params(BL::Scene &b_scene, bool background); - static SessionParams get_session_params( - BL::RenderEngine &b_engine, - BL::Preferences &b_userpref, - BL::Scene &b_scene, - bool background, - BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL)); + static SessionParams get_session_params(BL::RenderEngine &b_engine, + BL::Preferences 
&b_userpref, + BL::Scene &b_scene, + bool background); static bool get_session_pause(BL::Scene &b_scene, bool background); - static BufferParams get_buffer_params(BL::SpaceView3D &b_v3d, - BL::RegionView3D &b_rv3d, - Camera *cam, - int width, - int height, - const bool use_denoiser); - - static PassType get_pass_type(BL::RenderPass &b_pass); - static int get_denoising_pass(BL::RenderPass &b_pass); + static BufferParams get_buffer_params( + BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height); private: static DenoiseParams get_denoise_params(BL::Scene &b_scene, @@ -131,7 +119,7 @@ class BlenderSync { int width, int height, void **python_thread_state); - void sync_film(BL::SpaceView3D &b_v3d); + void sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d); void sync_view(); /* Shader */ @@ -245,6 +233,7 @@ class BlenderSync { Scene *scene; bool preview; bool experimental; + bool use_developer_ui; float dicing_rate; int max_subdivisions; @@ -253,7 +242,6 @@ class BlenderSync { RenderLayerInfo() : material_override(PointerRNA_NULL), use_background_shader(true), - use_background_ao(true), use_surfaces(true), use_hair(true), use_volumes(true), @@ -266,7 +254,6 @@ class BlenderSync { string name; BL::Material material_override; bool use_background_shader; - bool use_background_ao; bool use_surfaces; bool use_hair; bool use_volumes; diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp index 18bdfc74de0..62e32240bba 100644 --- a/intern/cycles/blender/blender_viewport.cpp +++ b/intern/cycles/blender/blender_viewport.cpp @@ -17,6 +17,8 @@ #include "blender_viewport.h" #include "blender_util.h" +#include "render/pass.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -26,11 +28,12 @@ BlenderViewportParameters::BlenderViewportParameters() studiolight_rotate_z(0.0f), studiolight_intensity(1.0f), studiolight_background_alpha(1.0f), - display_pass(PASS_COMBINED) + 
display_pass(PASS_COMBINED), + show_active_pixels(false) { } -BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d) +BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui) : BlenderViewportParameters() { if (!b_v3d) { @@ -55,7 +58,25 @@ BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d) } /* Film. */ - display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1); + + /* Lookup display pass based on the enum identifier. + * This is because integer values of python enum are not aligned with the passes definition in + * the kernel. */ + + display_pass = PASS_COMBINED; + + const string display_pass_identifier = get_enum_identifier(cshading, "render_pass"); + if (!display_pass_identifier.empty()) { + const ustring pass_type_identifier(string_to_lower(display_pass_identifier)); + const NodeEnum *pass_type_enum = Pass::get_type_enum(); + if (pass_type_enum->exists(pass_type_identifier)) { + display_pass = static_cast<PassType>((*pass_type_enum)[pass_type_identifier]); + } + } + + if (use_developer_ui) { + show_active_pixels = get_boolean(cshading, "show_active_pixels"); + } } bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters &other) const @@ -69,7 +90,7 @@ bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters bool BlenderViewportParameters::film_modified(const BlenderViewportParameters &other) const { - return display_pass != other.display_pass; + return display_pass != other.display_pass || show_active_pixels != other.show_active_pixels; } bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const @@ -82,18 +103,4 @@ bool BlenderViewportParameters::use_custom_shader() const return !(use_scene_world && use_scene_lights); } -PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes) -{ - if (b_v3d) { - const BlenderViewportParameters viewport_parameters(b_v3d); - 
const PassType display_pass = viewport_parameters.display_pass; - - passes.clear(); - Pass::add(display_pass, passes); - - return display_pass; - } - return PASS_NONE; -} - CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h index d6518597053..b5adafc30c9 100644 --- a/intern/cycles/blender/blender_viewport.h +++ b/intern/cycles/blender/blender_viewport.h @@ -39,9 +39,10 @@ class BlenderViewportParameters { /* Film. */ PassType display_pass; + bool show_active_pixels; BlenderViewportParameters(); - explicit BlenderViewportParameters(BL::SpaceView3D &b_v3d); + BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui); /* Check whether any of shading related settings are different from the given parameters. */ bool shader_modified(const BlenderViewportParameters &other) const; @@ -57,8 +58,6 @@ class BlenderViewportParameters { bool use_custom_shader() const; }; -PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes); - CCL_NAMESPACE_END #endif diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 048c2b95e40..d3497f3a8d8 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -832,18 +832,18 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer typedef StackAllocator<256, float2> LeafTimeStackAllocator; typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; - vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; - vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; - vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; - vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; - vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; + vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM]; + vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM]; + vector<int, LeafStackAllocator> 
p_object[PRIMITIVE_NUM]; + vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM]; + vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM]; /* TODO(sergey): In theory we should be able to store references. */ vector<BVHReference, LeafReferenceStackAllocator> object_references; - uint visibility[PRIMITIVE_NUM_TOTAL] = {0}; + uint visibility[PRIMITIVE_NUM] = {0}; /* NOTE: Keep initialization in sync with actual number of primitives. */ - BoundBox bounds[PRIMITIVE_NUM_TOTAL] = { + BoundBox bounds[PRIMITIVE_NUM] = { BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty}; int ob_num = 0; int num_new_prims = 0; @@ -877,7 +877,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer * TODO(sergey): With some pointer trickery we can write directly to the * destination buffers for the non-spatial split BVH. */ - BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL}; + BVHNode *leaves[PRIMITIVE_NUM + 1] = {NULL}; int num_leaves = 0; size_t start_index = 0; vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object; @@ -888,7 +888,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer if (need_prim_time) { local_prim_time.resize(num_new_prims); } - for (int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) { + for (int i = 0; i < PRIMITIVE_NUM; ++i) { int num = (int)p_type[i].size(); if (num != 0) { assert(p_type[i].size() == p_index[i].size()); diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp index 62f543941a9..96852510b63 100644 --- a/intern/cycles/bvh/bvh_embree.cpp +++ b/intern/cycles/bvh/bvh_embree.cpp @@ -37,10 +37,10 @@ /* Kernel includes are necessary so that the filter function for Embree can access the packed BVH. 
*/ # include "kernel/bvh/bvh_embree.h" -# include "kernel/kernel_compat_cpu.h" -# include "kernel/kernel_globals.h" +# include "kernel/bvh/bvh_util.h" +# include "kernel/device/cpu/compat.h" +# include "kernel/device/cpu/globals.h" # include "kernel/kernel_random.h" -# include "kernel/split/kernel_split_data_types.h" # include "render/hair.h" # include "render/mesh.h" @@ -73,46 +73,69 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) const RTCRay *ray = (RTCRay *)args->ray; RTCHit *hit = (RTCHit *)args->hit; CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; - KernelGlobals *kg = ctx->kg; + const KernelGlobals *kg = ctx->kg; switch (ctx->type) { case CCLIntersectContext::RAY_SHADOW_ALL: { - /* Append the intersection to the end of the array. */ - if (ctx->num_hits < ctx->max_hits) { - Intersection current_isect; - kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); - for (size_t i = 0; i < ctx->max_hits; ++i) { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); + + /* If no transparent shadows, all light is blocked. */ + const int flags = intersection_get_shader_flags(kg, ¤t_isect); + if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->max_hits == 0) { + ctx->opaque_hit = true; + return; + } + + /* Test if we need to record this transparent intersection. */ + if (ctx->num_hits < ctx->max_hits || ray->tfar < ctx->max_t) { + /* Skip already recorded intersections. */ + int num_recorded_hits = min(ctx->num_hits, ctx->max_hits); + + for (int i = 0; i < num_recorded_hits; ++i) { if (current_isect.object == ctx->isect_s[i].object && current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) { /* This intersection was already recorded, skip it. 
*/ *args->valid = 0; - break; + return; } } - Intersection *isect = &ctx->isect_s[ctx->num_hits]; - ++ctx->num_hits; - *isect = current_isect; - int prim = kernel_tex_fetch(__prim_index, isect->prim); - int shader = 0; - if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) { - shader = kernel_tex_fetch(__tri_shader, prim); - } - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } - int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags; - /* If no transparent shadows, all light is blocked. */ - if (flag & (SD_HAS_TRANSPARENT_SHADOW)) { - /* This tells Embree to continue tracing. */ - *args->valid = 0; + + /* If maximum number of hits was reached, replace the intersection with the + * highest distance. We want to find the N closest intersections. */ + int isect_index = num_recorded_hits; + if (num_recorded_hits + 1 >= ctx->max_hits) { + float max_t = ctx->isect_s[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; ++i) { + if (ctx->isect_s[i].t > max_t) { + max_recorded_hit = i; + max_t = ctx->isect_s[i].t; + } + } + + if (num_recorded_hits >= ctx->max_hits) { + isect_index = max_recorded_hit; + } + + /* Limit the ray distance and stop counting hits beyond this. + * TODO: is there some way we can tell Embree to stop intersecting beyond + * this distance when max number of hits is reached?. Or maybe it will + * become irrelevant if we make max_hits a very high number on the CPU. */ + ctx->max_t = max(current_isect.t, max_t); } + + ctx->isect_s[isect_index] = current_isect; } - else { - /* Increase the number of hits beyond ray.max_hits - * so that the caller can detect this as opaque. */ - ++ctx->num_hits; - } + + /* Always increase the number of hits, even beyond ray.max_hits so that + * the caller can detect this as and consider it opaque, or trace another + * ray. */ + ++ctx->num_hits; + + /* This tells Embree to continue tracing. 
*/ + *args->valid = 0; break; } case CCLIntersectContext::RAY_LOCAL: @@ -329,7 +352,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_) scene = NULL; } - const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC; + const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC; scene = rtcNewScene(rtc_device); const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) | diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 2dc10f30363..31b3971c110 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -31,6 +31,27 @@ CCL_NAMESPACE_BEGIN */ typedef KernelBVHLayout BVHLayout; +/* Type of BVH, in terms whether it is supported dynamic updates of meshes + * or whether modifying geometry requires full BVH rebuild. + */ +enum BVHType { + /* BVH supports dynamic updates of geometry. + * + * Faster for updating BVH tree when doing modifications in viewport, + * but slower for rendering. + */ + BVH_TYPE_DYNAMIC = 0, + /* BVH tree is calculated for specific scene, updates in geometry + * requires full tree rebuild. + * + * Slower to update BVH tree when modifying objects in viewport, also + * slower to build final BVH tree but gives best possible render speed. + */ + BVH_TYPE_STATIC = 1, + + BVH_NUM_TYPES, +}; + /* Names bitflag type to denote which BVH layouts are supported by * particular area. 
* diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 04ff598621a..da259171844 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -287,9 +287,6 @@ if(CYCLES_STANDALONE_REPOSITORY) endif() set(__boost_packages filesystem regex system thread date_time) - if(WITH_CYCLES_NETWORK) - list(APPEND __boost_packages serialization) - endif() if(WITH_CYCLES_OSL) list(APPEND __boost_packages wave) endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 928249931a3..d18f4360aef 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -36,49 +36,70 @@ endif() set(SRC device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_dummy.cpp + device_denoise.cpp + device_graphics_interop.cpp + device_kernel.cpp device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_optix.cpp - device_split_kernel.cpp - device_task.cpp + device_queue.cpp +) + +set(SRC_CPU + cpu/device.cpp + cpu/device.h + cpu/device_impl.cpp + cpu/device_impl.h + cpu/kernel.cpp + cpu/kernel.h + cpu/kernel_function.h + cpu/kernel_thread_globals.cpp + cpu/kernel_thread_globals.h ) set(SRC_CUDA - cuda/device_cuda.h - cuda/device_cuda_impl.cpp + cuda/device.cpp + cuda/device.h + cuda/device_impl.cpp + cuda/device_impl.h + cuda/graphics_interop.cpp + cuda/graphics_interop.h + cuda/kernel.cpp + cuda/kernel.h + cuda/queue.cpp + cuda/queue.h + cuda/util.cpp + cuda/util.h ) -set(SRC_OPENCL - opencl/device_opencl.h - opencl/device_opencl_impl.cpp - opencl/memory_manager.h - opencl/memory_manager.cpp - opencl/opencl_util.cpp +set(SRC_DUMMY + dummy/device.cpp + dummy/device.h ) -if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) -endif() +set(SRC_MULTI + multi/device.cpp + multi/device.h +) + +set(SRC_OPTIX + optix/device.cpp + optix/device.h + optix/device_impl.cpp + optix/device_impl.h + optix/queue.cpp + 
optix/queue.h + optix/util.h +) set(SRC_HEADERS device.h - device_denoising.h + device_denoise.h + device_graphics_interop.h device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device_kernel.h + device_queue.h ) set(LIB - cycles_render cycles_kernel cycles_util ${CYCLES_GL_LIBRARIES} @@ -95,15 +116,7 @@ else() endif() add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() -if(WITH_CYCLES_DEVICE_OPENCL) - list(APPEND LIB - extern_clew - ) - add_definitions(-DWITH_OPENCL) -endif() + if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() @@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI) endif() if(WITH_OPENIMAGEDENOISE) - add_definitions(-DWITH_OPENIMAGEDENOISE) - add_definitions(-DOIDN_STATIC_LIB) - list(APPEND INC_SYS - ${OPENIMAGEDENOISE_INCLUDE_DIRS} - ) list(APPEND LIB ${OPENIMAGEDENOISE_LIBRARIES} - ${TBB_LIBRARIES} ) endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device "${LIB}" + ${SRC} + ${SRC_CPU} + ${SRC_CUDA} + ${SRC_DUMMY} + ${SRC_MULTI} + ${SRC_OPTIX} + ${SRC_HEADERS} +) + +source_group("cpu" FILES ${SRC_CPU}) +source_group("cuda" FILES ${SRC_CUDA}) +source_group("dummy" FILES ${SRC_DUMMY}) +source_group("multi" FILES ${SRC_MULTI}) +source_group("optix" FILES ${SRC_OPTIX}) +source_group("common" FILES ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp new file mode 100644 index 00000000000..68ca8e8bb22 --- /dev/null +++ b/intern/cycles/device/cpu/device.cpp @@ -0,0 +1,64 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device.h" +#include "device/cpu/device_impl.h" + +/* Used for `info.denoisers`. */ +/* TODO(sergey): The denoisers are probably to be moved completely out of the device into their + * own class. But until then keep API consistent with how it used to work before. */ +#include "util/util_openimagedenoise.h" + +CCL_NAMESPACE_BEGIN + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new CPUDevice(info, stats, profiler); +} + +void device_cpu_info(vector<DeviceInfo> &devices) +{ + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_osl = true; + info.has_half_images = true; + info.has_nanovdb = true; + info.has_profiling = true; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } + + devices.insert(devices.begin(), info); +} + +string device_cpu_capabilities() +{ + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? 
"AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/device/cpu/device.h index dcea2630aef..9cb2e80068d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/device/cpu/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,22 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_buffer_update.h" +#pragma once -#define KERNEL_NAME buffer_update -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cpu_info(vector<DeviceInfo> &devices); + +string device_cpu_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp new file mode 100644 index 00000000000..3b0db6bdd0e --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device_impl.h" + +#include <stdlib.h> +#include <string.h> + +/* So ImathMath is included before our kernel_cpu_compat. */ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "device/device.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/kernel_types.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "bvh/bvh_embree.h" + +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_thread.h" + +CCL_NAMESPACE_BEGIN + +CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + /* Pick any kernel, all of them are supposed to have same level of microarchitecture + * optimization. 
*/ + VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name() + << " kernels."; + + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif +#ifdef WITH_EMBREE + embree_device = rtcNewDevice("verbose=0"); +#endif + need_texture_info = false; +} + +CPUDevice::~CPUDevice() +{ +#ifdef WITH_EMBREE + rtcReleaseDevice(embree_device); +#endif + + texture_info.free(); +} + +bool CPUDevice::show_samples() const +{ + return (info.cpu_threads == 1); +} + +BVHLayoutMask CPUDevice::get_bvh_layout_mask() const +{ + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; +#ifdef WITH_EMBREE + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; +} + +bool CPUDevice::load_texture_info() +{ + if (!need_texture_info) { + return false; + } + + texture_info.copy_to_device(); + need_texture_info = false; + + return true; +} + +void CPUDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } +} + +void CPUDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } +} + +void CPUDevice::mem_copy_from( + device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) +{ + /* no-op */ +} + +void CPUDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } +} + +void CPUDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CPUDevice::const_copy_to(const char *name, void *host, size_t size) +{ +#if WITH_EMBREE + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Update scene handle (since it is different for each device on multi devices) + KernelData *const data = (KernelData *)host; + 
data->bvh.scene = embree_scene; + } +#endif + kernel_const_copy(&kernel_globals, name, host, size); +} + +void CPUDevice::global_alloc(device_memory &mem) +{ + VLOG(1) << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); +} + +void CPUDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +void CPUDevice::tex_alloc(device_texture &mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ + texture_info.resize(slot + 128); + } + + texture_info[slot] = mem.info; + texture_info[slot].data = (uint64_t)mem.host_pointer; + need_texture_info = true; +} + +void CPUDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } +} + +void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ +#ifdef WITH_EMBREE + if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else +#endif + Device::build_bvh(bvh, progress, refit); +} + +#if 0 +void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) +{ + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + + if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { + tile.stealing_state = RenderTile::WAS_STOLEN; + break; + } + + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + tile.sample = sample + 1; + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + const int num_progress_samples = end_sample - sample; + tile.sample = end_sample; + task.update_progress(&tile, tile.w * tile.h * num_progress_samples); + break; + } + } + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + + if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { + adaptive_sampling_post(tile, kg); + } +} + +void CPUDevice::thread_render(DeviceTask &task) +{ + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); + + profiler.add_state(&kg.profiler); + + /* NLM denoiser. */ + DenoisingTask *denoising = NULL; + + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. 
*/ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + + RenderTile tile; + while (task.acquire_tile(this, tile, tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::DENOISE) { + denoise_openimagedenoise(task, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + + profiler.remove_state(&kg.profiler); + + delete denoising; +} + +void CPUDevice::thread_denoise(DeviceTask &task) +{ + RenderTile tile; + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + denoise_openimagedenoise(task, tile); + + task.update_progress(&tile, tile.w * tile.h); +} +#endif + +const CPUKernels *CPUDevice::get_cpu_kernels() const +{ + return &kernels; +} + +void CPUDevice::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + /* Ensure latest texture info is loaded into kernel globals before returning. 
*/ + load_texture_info(); + + kernel_thread_globals.clear(); + void *osl_memory = get_cpu_osl_memory(); + for (int i = 0; i < info.cpu_threads; i++) { + kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler); + } +} + +void *CPUDevice::get_cpu_osl_memory() +{ +#ifdef WITH_OSL + return &osl_globals; +#else + return NULL; +#endif +} + +bool CPUDevice::load_kernels(const uint /*kernel_features*/) +{ + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h new file mode 100644 index 00000000000..7d222808652 --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.h @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/device.h" +#include "device/device_memory.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/device/cpu/globals.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +class CPUDevice : public Device { + public: + KernelGlobals kernel_globals; + + device_vector<TextureInfo> texture_info; + bool need_texture_info; + +#ifdef WITH_OSL + OSLGlobals osl_globals; +#endif +#ifdef WITH_EMBREE + RTCScene embree_scene = NULL; + RTCDevice embree_device; +#endif + + CPUKernels kernels; + + CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); + ~CPUDevice(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + /* Returns true if the texture info was copied to the device (meaning, some more + * re-initialization might be needed). 
*/ + bool load_texture_info(); + + virtual void mem_alloc(device_memory &mem) override; + virtual void mem_copy_to(device_memory &mem) override; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + virtual void mem_zero(device_memory &mem) override; + virtual void mem_free(device_memory &mem) override; + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + void tex_free(device_texture &mem); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + virtual const CPUKernels *get_cpu_kernels() const override; + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) override; + virtual void *get_cpu_osl_memory() override; + + protected: + virtual bool load_kernels(uint /*kernel_features*/) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp new file mode 100644 index 00000000000..0ab58ff8600 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/cpu/kernel.h" + +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + +#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) + +CPUKernels::CPUKernels() + : /* Integrator. */ + REGISTER_KERNEL(integrator_init_from_camera), + REGISTER_KERNEL(integrator_init_from_bake), + REGISTER_KERNEL(integrator_intersect_closest), + REGISTER_KERNEL(integrator_intersect_shadow), + REGISTER_KERNEL(integrator_intersect_subsurface), + REGISTER_KERNEL(integrator_intersect_volume_stack), + REGISTER_KERNEL(integrator_shade_background), + REGISTER_KERNEL(integrator_shade_light), + REGISTER_KERNEL(integrator_shade_shadow), + REGISTER_KERNEL(integrator_shade_surface), + REGISTER_KERNEL(integrator_shade_volume), + REGISTER_KERNEL(integrator_megakernel), + /* Shader evaluation. */ + REGISTER_KERNEL(shader_eval_displace), + REGISTER_KERNEL(shader_eval_background), + /* Adaptive sampling. */ + REGISTER_KERNEL(adaptive_sampling_convergence_check), + REGISTER_KERNEL(adaptive_sampling_filter_x), + REGISTER_KERNEL(adaptive_sampling_filter_y), + /* Cryptomatte. */ + REGISTER_KERNEL(cryptomatte_postprocess), + /* Bake. */ + REGISTER_KERNEL(bake) +{ +} + +#undef REGISTER_KERNEL +#undef KERNEL_FUNCTIONS + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h new file mode 100644 index 00000000000..54b18308544 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.h @@ -0,0 +1,111 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/cpu/kernel_function.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct KernelGlobals; +struct IntegratorStateCPU; +struct TileInfo; + +class CPUKernels { + public: + /* Integrator. */ + + using IntegratorFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>; + using IntegratorShadeFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>; + using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg, + IntegratorStateCPU *state, + KernelWorkTile *tile, + ccl_global float *render_buffer)>; + + IntegratorInitFunction integrator_init_from_camera; + IntegratorInitFunction integrator_init_from_bake; + IntegratorFunction integrator_intersect_closest; + IntegratorFunction integrator_intersect_shadow; + IntegratorFunction integrator_intersect_subsurface; + IntegratorFunction integrator_intersect_volume_stack; + IntegratorShadeFunction integrator_shade_background; + IntegratorShadeFunction integrator_shade_light; + IntegratorShadeFunction integrator_shade_shadow; + IntegratorShadeFunction integrator_shade_surface; + IntegratorShadeFunction integrator_shade_volume; + IntegratorShadeFunction integrator_megakernel; + + /* Shader evaluation. 
*/ + + using ShaderEvalFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>; + + ShaderEvalFunction shader_eval_displace; + ShaderEvalFunction shader_eval_background; + + /* Adaptive stopping. */ + + using AdaptiveSamplingConvergenceCheckFunction = + CPUKernelFunction<bool (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride)>; + + using AdaptiveSamplingFilterXFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride)>; + + using AdaptiveSamplingFilterYFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride)>; + + AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check; + + AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x; + AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y; + + /* Cryptomatte. */ + + using CryptomattePostprocessFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>; + + CryptomattePostprocessFunction cryptomatte_postprocess; + + /* Bake. */ + + CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake; + + CPUKernels(); +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h new file mode 100644 index 00000000000..aa18720cc24 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_function.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_debug.h" +#include "util/util_system.h" + +CCL_NAMESPACE_BEGIN + +/* A wrapper around per-microarchitecture variant of a kernel function. + * + * Provides a function-call-like API which gets routed to the most suitable implementation. + * + * For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */ +template<typename FunctionType> class CPUKernelFunction { + public: + CPUKernelFunction(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + kernel_info_ = get_best_kernel_info( + kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2); + } + + template<typename... Args> inline auto operator()(Args... args) const + { + assert(kernel_info_.kernel); + + return kernel_info_.kernel(args...); + } + + const char *get_uarch_name() const + { + return kernel_info_.uarch_name; + } + + protected: + /* Helper class which allows to pass human-readable microarchitecture name together with function + * pointer. */ + class KernelInfo { + public: + KernelInfo() : KernelInfo("", nullptr) + { + } + + /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without + * memory allocation. 
*/ + KernelInfo(const char *uarch_name, FunctionType kernel) + : uarch_name(uarch_name), kernel(kernel) + { + } + + const char *uarch_name; + FunctionType kernel; + }; + + KernelInfo get_best_kernel_info(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + /* Silence warnings about unused variables when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + return KernelInfo("AVX2", kernel_avx2); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + return KernelInfo("AVX", kernel_avx); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + return KernelInfo("SSE4.1", kernel_sse41); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + return KernelInfo("SSE3", kernel_sse3); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + return KernelInfo("SSE2", kernel_sse2); + } +#endif + + return KernelInfo("default", kernel_default); + } + + KernelInfo kernel_info_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp new file mode 100644 index 00000000000..988b00cd1f0 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp @@ -0,0 +1,85 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/kernel_thread_globals.h" + +// clang-format off +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "util/util_profiling.h" + +CCL_NAMESPACE_BEGIN + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler) + : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler) +{ + reset_runtime_memory(); + +#ifdef WITH_OSL + OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory)); +#else + (void)osl_globals_memory; +#endif +} + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept + : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_) +{ + other.reset_runtime_memory(); +} + +CPUKernelThreadGlobals::~CPUKernelThreadGlobals() +{ +#ifdef WITH_OSL + OSLShader::thread_free(this); +#endif +} + +CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other) +{ + if (this == &other) { + return *this; + } + + *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other); + + other.reset_runtime_memory(); + + return *this; +} + +void CPUKernelThreadGlobals::reset_runtime_memory() +{ +#ifdef WITH_OSL + osl = nullptr; +#endif +} + +void CPUKernelThreadGlobals::start_profiling() +{ + cpu_profiler_.add_state(&profiler); +} + +void CPUKernelThreadGlobals::stop_profiling() +{ + cpu_profiler_.remove_state(&profiler); +} + +CCL_NAMESPACE_END diff --git 
a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h new file mode 100644 index 00000000000..d005c3bb56c --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + +CCL_NAMESPACE_BEGIN + +class Profiler; + +/* A special class which extends memory ownership of the `KernelGlobals` decoupling any resource + * which is not thread-safe for access. Every worker thread which needs to operate on + * `KernelGlobals` needs to initialize its own copy of this object. + * + * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means that + * there is no unnecessary data duplication happening when using this object. */ +class CPUKernelThreadGlobals : public KernelGlobals { + public: + /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building + * without OSL support. Will avoid the need for those unnamed pointers and casts. 
*/ + CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler); + + ~CPUKernelThreadGlobals(); + + CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept; + + CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other); + + void start_profiling(); + void stop_profiling(); + + protected: + void reset_runtime_memory(); + + Profiler &cpu_profiler_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp index 2e225ecfaf8..84becd6d081 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -14,21 +14,25 @@ * limitations under the License. */ -#ifdef WITH_CUDA +#include "device/cuda/device.h" + +#include "util/util_logging.h" -# include "device/cuda/device_cuda.h" +#ifdef WITH_CUDA +# include "device/cuda/device_impl.h" # include "device/device.h" -# include "device/device_intern.h" -# include "util/util_logging.h" # include "util/util_string.h" # include "util/util_windows.h" +#endif /* WITH_CUDA */ CCL_NAMESPACE_BEGIN bool device_cuda_init() { -# ifdef WITH_CUDA_DYNLOAD +#if !defined(WITH_CUDA) + return false; +#elif defined(WITH_CUDA_DYNLOAD) static bool initialized = false; static bool result = false; @@ -59,16 +63,27 @@ bool device_cuda_init() } return result; -# else /* WITH_CUDA_DYNLOAD */ +#else /* WITH_CUDA_DYNLOAD */ return true; -# endif /* WITH_CUDA_DYNLOAD */ +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new CUDADevice(info, stats, profiler, background); +#ifdef WITH_CUDA + return new CUDADevice(info, stats, profiler); +#else + (void)info; + (void)stats; + 
(void)profiler; + + LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen."; + + return nullptr; +#endif } +#ifdef WITH_CUDA static CUresult device_cuda_safe_init() { # ifdef _WIN32 @@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init() return cuInit(0); # endif } +#endif /* WITH_CUDA */ void device_cuda_info(vector<DeviceInfo> &devices) { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) @@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_nanovdb = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; + info.denoisers = 0; + + info.has_gpu_queue = true; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { @@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices) if (!display_devices.empty()) devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_CUDA */ + (void)devices; +#endif /* WITH_CUDA */ } string device_cuda_capabilities() { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) { @@ -310,8 +331,10 @@ string device_cuda_capabilities() } return capabilities; + +#else /* WITH_CUDA */ + return ""; +#endif /* WITH_CUDA */ } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/device/cuda/device.h index e68d4104a91..b0484904d1a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl +++ b/intern/cycles/device/cuda/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may 
not use this file except in compliance with the License. @@ -14,13 +14,24 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_enqueue_inactive.h" +#pragma once -#define KERNEL_NAME enqueue_inactive -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_cuda_init(); + +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cuda_info(vector<DeviceInfo> &devices); + +string device_cuda_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h deleted file mode 100644 index c3271c3cfcf..00000000000 --- a/intern/cycles/device/cuda/device_cuda.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_task.h" - -# ifdef WITH_CUDA_DYNLOAD -# include "cuew.h" -# else -# include "util/util_opengl.h" -# include <cuda.h> -# include <cudaGL.h> -# endif - -CCL_NAMESPACE_BEGIN - -class CUDASplitKernel; - -class CUDADevice : public Device { - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; - - public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int pitch_alignment; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map<device_memory *, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - thread_mutex cuda_mem_map_mutex; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - /* Kernels */ - struct { - bool loaded; - - CUfunction adaptive_stopping; - CUfunction adaptive_filter_x; - CUfunction adaptive_filter_y; - CUfunction adaptive_scale_samples; - int adaptive_num_threads_per_block; - } functions; - - static bool have_precompiled_kernels(); - - virtual bool show_samples() const override; - - virtual BVHLayoutMask get_bvh_layout_mask() const override; - - void set_error(const string &error) override; - - CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_); - - virtual ~CUDADevice(); - - bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - - bool check_peer_access(Device *peer_device) override; - - bool use_adaptive_compilation(); - - bool use_split_kernel(); - - virtual string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false); - - string compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base = "cuda", - bool force_ptx = false); - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; - - void load_functions(); - - void reserve_local_memory(const DeviceRequestedFeatures &requested_features); - - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); - - void mem_alloc(device_memory &mem) override; - - void 
mem_copy_to(device_memory &mem) override; - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - - void mem_zero(device_memory &mem) override; - - void mem_free(device_memory &mem) override; - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - - virtual void const_copy_to(const char *name, void *host, size_t size) override; - - void global_alloc(device_memory &mem); - - void global_free(device_memory &mem); - - void tex_alloc(device_texture &mem); - - void tex_free(device_texture &mem); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - - bool denoising_construct_transform(DenoisingTask *task); - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - void denoise(RenderTile &rtile, DenoisingTask &denoising); - - void adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - void 
adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - - void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - - void shader(DeviceTask &task); - - CUdeviceptr map_pixels(device_ptr mem); - - void unmap_pixels(device_ptr mem); - - void pixels_alloc(device_memory &mem); - - void pixels_copy_from(device_memory &mem, int y, int w, int h); - - void pixels_free(device_memory &mem); - - void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override; - - void thread_run(DeviceTask &task); - - virtual void task_add(DeviceTask &task) override; - - virtual void task_wait() override; - - virtual void task_cancel() override; -}; - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp deleted file mode 100644 index 2d2fcb38705..00000000000 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ /dev/null @@ -1,2714 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include <climits> -# include <limits.h> -# include <stdio.h> -# include <stdlib.h> -# include <string.h> - -# include "device/cuda/device_cuda.h" -# include "device/device_intern.h" -# include "device/device_split_kernel.h" - -# include "render/buffers.h" - -# include "kernel/filter/filter_defines.h" - -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_map.h" -# include "util/util_md5.h" -# include "util/util_opengl.h" -# include "util/util_path.h" -# include "util/util_string.h" -# include "util/util_system.h" -# include "util/util_time.h" -# include "util/util_types.h" -# include "util/util_windows.h" - -# include "kernel/split/kernel_split_data_types.h" - -CCL_NAMESPACE_BEGIN - -# ifndef WITH_CUDA_DYNLOAD - -/* Transparently implement some functions, so majority of the file does not need - * to worry about difference between dynamically loaded and linked CUDA at all. - */ - -namespace { - -const char *cuewErrorString(CUresult result) -{ - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. 
- */ - static string error; - error = string_printf("%d", result); - return error.c_str(); -} - -const char *cuewCompilerPath() -{ - return CYCLES_CUDA_NVCC_EXECUTABLE; -} - -int cuewCompilerVersion() -{ - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); -} - -} /* namespace */ -# endif /* WITH_CUDA_DYNLOAD */ - -class CUDADevice; - -class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; - - public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); -}; - -/* Utility to push/pop CUDA context. */ -class CUDAContextScope { - public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); - - private: - CUDADevice *device; -}; - -bool CUDADevice::have_precompiled_kernels() -{ - string cubins_path = path_get("lib"); - return path_exists(cubins_path); -} - -bool CUDADevice::show_samples() const -{ - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; -} - -BVHLayoutMask CUDADevice::get_bvh_layout_mask() const -{ - return BVH_LAYOUT_BVH2; -} - -void CUDADevice::set_error(const string &error) -{ - Device::set_error(error); - - if (first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, - "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } -} - -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL) -{ - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - pitch_alignment = 0; - - functions.loaded = false; - - /* Initialize CUDA. */ - CUresult result = cuInit(0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); - return; - } - - /* Setup device and context. */ - result = cuDeviceGet(&cuDevice, cuDevId); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", - cuewErrorString(result))); - return; - } - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. 
*/ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - cuda_assert(cuDeviceGetAttribute( - &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if (can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. */ - if (background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if (result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); - return; - } - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major * 100 + minor * 10; - - /* Pop context set by cuCtxCreate. 
*/ - cuCtxPopCurrent(NULL); -} - -CUDADevice::~CUDADevice() -{ - task_pool.cancel(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); -} - -bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/) -{ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if (major < 3) { - set_error(string_printf( - "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); - return false; - } - - return true; -} - -bool CUDADevice::check_peer_access(Device *peer_device) -{ - if (peer_device == this) { - return false; - } - if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { - return false; - } - - CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); - - int can_access = 0; - cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Ensure array access over the link is possible as well (for 3D textures) - cuda_assert(cuDeviceGetP2PAttribute(&can_access, - CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED, - cuDevice, - peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Enable peer access in both directions - { - const CUDAContextScope scope(this); - CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - return false; - } - } - { - const CUDAContextScope scope(peer_device_cuda); - CUresult result = cuCtxEnablePeerAccess(cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - 
return false; - } - } - - return true; -} - -bool CUDADevice::use_adaptive_compilation() -{ - return DebugFlags().cuda.adaptive_compile; -} - -bool CUDADevice::use_split_kernel() -{ - return DebugFlags().cuda.split_kernel; -} - -/* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ -string CUDADevice::compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter, bool split) -{ - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf( - "-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if (!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if (extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } - - if (split) { - cflags += " -D__SPLIT__"; - } - -# ifdef WITH_NANOVDB - cflags += " -DWITH_NANOVDB"; -# endif - - return cflags; -} - -string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base, - bool force_ptx) -{ - /* Compute kernel name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. 
*/ - if (!use_adaptive_compilation()) { - if (!force_ptx) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ - int ptx_major = major, ptx_minor = minor; - while (ptx_major >= 3) { - const string ptx = path_get( - string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; - } - - if (ptx_minor > 0) { - ptx_minor--; - } - else { - ptx_major--; - ptx_minor = 9; - } - } - } - - /* Try to use locally compiled kernel. */ - string source_path = path_get("source"); - const string source_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - string common_cflags = compile_kernel_get_common_cflags( - requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL); - const string kernel_md5 = util_md5_string(source_md5 + common_cflags); - - const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; - const char *const kernel_arch = force_ptx ? 
"compute" : "sm"; - const string cubin_file = string_printf( - "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } - -# ifdef _WIN32 - if (!use_adaptive_compilation() && have_precompiled_kernels()) { - if (major < 3) { - set_error( - string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " - "Your GPU is not supported.", - major, - minor)); - } - else { - set_error( - string_printf("CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, - minor)); - } - return string(); - } -# endif - - /* Compile. */ - const char *const nvcc = cuewCompilerPath(); - if (nvcc == NULL) { - set_error( - "CUDA nvcc compiler not found. " - "Install CUDA toolkit in default location."); - return string(); - } - - const int nvcc_cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; - if (nvcc_cuda_version < 101) { - printf( - "Unsupported CUDA version %d.%d detected, " - "you need CUDA 10.1 or newer.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - return string(); - } - else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || - nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { - printf( - "CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 to 11.4 are officially supported.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - } - - double starttime = time_dt(); - - path_create_directories(cubin); - - source_path = path_join(path_join(source_path, "kernel"), - path_join("kernels", path_join(base, string_printf("%s.cu", name)))); - - string command = string_printf( - "\"%s\" 
" - "-arch=%s_%d%d " - "--%s \"%s\" " - "-o \"%s\" " - "%s", - nvcc, - kernel_arch, - major, - minor, - kernel_ext, - source_path.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); - -# ifdef _WIN32 - command = "call " + command; -# endif - if (system(command.c_str()) != 0) { - set_error( - "Failed to execute compilation command, " - "see console for details."); - return string(); - } - - /* Verify if compilation succeeded */ - if (!path_exists(cubin)) { - set_error( - "CUDA kernel compilation failed, " - "see console for details."); - return string(); - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; -} - -bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if (cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if (cuContext == 0) - return false; - - /* check if GPU is supported */ - if (!support_device(requested_features)) - return false; - - /* get kernel */ - const char *kernel_name = use_split_kernel() ? 
"kernel_split" : "kernel"; - string cubin = compile_kernel(requested_features, kernel_name); - if (cubin.empty()) - return false; - - const char *filter_name = "filter"; - string filter_cubin = compile_kernel(requested_features, filter_name); - if (filter_cubin.empty()) - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if (path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf( - "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); - - if (path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)", - filter_cubin.c_str(), - cuewErrorString(result))); - - if (result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - load_functions(); - - return (result == CUDA_SUCCESS); -} - -void CUDADevice::load_functions() -{ - /* TODO: load all functions here. 
*/ - if (functions.loaded) { - return; - } - functions.loaded = true; - - cuda_assert(cuModuleGetFunction( - &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); - - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); - - int unused_min_blocks; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, - &functions.adaptive_num_threads_per_block, - functions.adaptive_scale_samples, - NULL, - 0, - 0)); -} - -void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) -{ - if (use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuRender; - - if (requested_features.use_baking) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; - -# if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while (free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } -# endif -} - -void CUDADevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void CUDADevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. 
*/ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. 
- * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) -{ - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. 
*/ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. 
Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; -} - -void CUDADevice::generic_copy_to(device_memory &mem) -{ - if (!mem.host_pointer || !mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } -} - -void CUDADevice::generic_free(device_memory &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. 
*/ - - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuda_assert(cuMemFree(mem.device_pointer)); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } -} - -void CUDADevice::mem_alloc(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else { - generic_alloc(mem); - } -} - -void CUDADevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - generic_alloc(mem); - } - generic_copy_to(mem); - } -} - -void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { - assert(!"mem_copy_from not supported for textures."); - } - else if (mem.host_pointer) { - const size_t size = elem * w * h; - const size_t offset = elem * y * w; - - if (mem.device_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); - } - else { - memset((char *)mem.host_pointer + 
offset, 0, size); - } - } -} - -void CUDADevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); - } - if (!mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory - * regardless of mem.host_pointer and mem.shared_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); - } - else if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } -} - -void CUDADevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - generic_free(mem); - } -} - -device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) -{ - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); -} - -void CUDADevice::const_copy_to(const char *name, void *host, size_t size) -{ - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); -} - -void CUDADevice::global_alloc(device_memory &mem) -{ - if (mem.is_resident(this)) { - generic_alloc(mem); - generic_copy_to(mem); - } - - const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); -} - -void CUDADevice::global_free(device_memory &mem) -{ - if (mem.is_resident(this) && mem.device_pointer) { - generic_free(mem); - } -} - -void CUDADevice::tex_alloc(device_texture &mem) -{ - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = 
datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.info.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.info.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (!mem.is_resident(this)) { - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - - if (mem.data_depth > 1) { - array_3d = (CUarray)mem.device_pointer; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - dst_pitch = align_up(src_pitch, pitch_alignment); - } - } - else if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. 
*/ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if (!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. */ - dst_pitch = align_up(src_pitch, pitch_alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if (!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. 
*/ - cmem = generic_alloc(mem); - if (!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Resize once */ - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - texture_info[slot] = mem.info; - need_texture_info = true; - - if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if (array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if (mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - texture_info[slot].data = (uint64_t)cmem->texobject; - } - else { - texture_info[slot].data = 
(uint64_t)mem.device_pointer; - } -} - -void CUDADevice::tex_free(device_texture &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - if (cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if (!mem.is_resident(this)) { - /* Do not free memory here, since it was allocated on a different device. */ - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else if (cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - lock.unlock(); - generic_free(mem); - } - } -} - -# define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -# define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); - -/* Similar as above, but for 1-dimensional blocks. 
*/ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); - -bool CUDADevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - int frame_offset = 0; - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, - &variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &channel_offset, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, - &image_ptr, - &out_ptr, - &weightAccum, - &w, - &h, - &stride, - &pass_stride, - &channel_offset, - &r, - &f}; - - 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert( - cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); -} - -bool CUDADevice::denoising_construct_transform(DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction( - &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = 
task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert( - cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &pass_stride, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - 
&task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, - &h, - &stride, - &pass_stride, - &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE( - cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction( - &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterCombineHalves, task->rect.z 
- task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction( - &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert( - cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - 
&task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction( - &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = { - &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &CUDADevice::denoising_construct_transform, 
this, &denoising); - denoising.functions.accumulate = function_bind( - &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void CUDADevice::adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - - /* These are a series of tiny kernels because there is no grid synchronization - * from within a kernel, so multiple kernel launches it is. 
*/ - uint total_work_size = wtile->h * wtile->w; - void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_stopping, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->h; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->w; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); -} - -void CUDADevice::adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - uint total_work_size = wtile->h * wtile->w; - - void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args, - 0)); -} - -void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuRender; - - /* Get kernel function. 
*/ - if (rtile.task == RenderTile::BAKE) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert( - cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - cuda_assert(cuCtxSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void CUDADevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - CUdeviceptr d_buffer = (CUdeviceptr)buffer; - - /* get kernel function */ - if (rgba_half) { - cuda_assert( - cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - float sample_scale = 1.0f / (task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1) / xthreads; - int yblocks = (task.h + ythreads - 1) / ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks, - yblocks, - 1, /* blocks */ - xthreads, - ythreads, - 1, /* threads */ - 0, - 0, - args, - 0)); - - unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - - cuda_assert(cuCtxSynchronize()); -} - -void CUDADevice::shader(DeviceTask &task) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = (CUdeviceptr)task.shader_input; - CUdeviceptr d_output = (CUdeviceptr)task.shader_output; - - /* get kernel function */ - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for (int sample = 0; sample < task.num_samples && !canceled; sample++) { - for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - cuda_assert(cuCtxSynchronize()); - - if (task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } -} - -CUdeviceptr 
CUDADevice::map_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return (CUdeviceptr)mem; -} - -void CUDADevice::unmap_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } -} - -void CUDADevice::pixels_alloc(device_memory &mem) -{ - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if (mem.data_type == TYPE_HALF) - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer( - &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if (result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop 
*/ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } -} - -void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h) -{ - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar) * 4 * y * w; - memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); -} - -void CUDADevice::pixels_free(device_memory &mem) -{ - if (mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } -} - -void CUDADevice::draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - assert(mem.type == MEM_PIXELS); - - if (!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4 * y * w; - - if (mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) 
{ - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - - * avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w / (float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w / (float)pmem.w; - vpointer[9] = (float)h / (float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h / (float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if (transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); -} - -void CUDADevice::thread_run(DeviceTask &task) -{ - CUDAContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if (use_split_kernel()) { - if (split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - render(task, tile, work_tiles); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - - cuda_assert(cuCtxSynchronize()); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - 
RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void CUDADevice::task_add(DeviceTask &task) -{ - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if (task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } -} - -void CUDADevice::task_wait() -{ - task_pool.wait(); -} - -void CUDADevice::task_cancel() -{ - task_pool.cancel(); -} - -/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class - * now that the definition of that class is complete - */ -# undef cuda_assert -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - device->set_error( \ - string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -/* CUDA context scope. 
*/ - -CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) -{ - cuda_assert(cuCtxPushCurrent(device->cuContext)); -} - -CUDAContextScope::~CUDAContextScope() -{ - cuda_assert(cuCtxPopCurrent(NULL)); -} - -/* split kernel */ - -class CUDASplitKernelFunction : public SplitKernelFunction { - CUDADevice *device; - CUfunction func; - - public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) - { - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if (device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert( - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / - threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - return !device->have_error(); - } -}; - -CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, - device_memory & /*data*/, - size_t num_threads) -{ - CUDAContextScope scope(device); - - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer; - - struct args_t { - uint *num_threads; - CUdeviceptr *size; - }; - - args_t args = 
{&threads, &d_size}; - - CUfunction state_buffer_size; - cuda_assert( - cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - - cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - return size; -} - -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory & /*kernel_globals*/, - device_memory & /*kernel_data*/, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) -{ - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; - CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; - CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; - CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; - CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; - - CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer; - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr *split_data_buffer; - int *num_elements; - CUdeviceptr *ray_state; - int *start_sample; - int *end_sample; - int *sx; - int *sy; - int *sw; - int *sh; - int *offset; - int *stride; - CUdeviceptr *queue_index; - int *queuesize; - CUdeviceptr *use_queues_flag; - CUdeviceptr *work_pool_wgs; - int *num_samples; - CUdeviceptr *buffer; - }; - - args_t args = {&d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer}; - - 
CUfunction data_init; - cuda_assert( - cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if (device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); - - return !device->have_error(); -} - -SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - const CUDAContextScope scope(device); - - CUfunction func; - const CUresult result = cuModuleGetFunction( - &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()); - if (result != CUDA_SUCCESS) { - device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)", - kernel_name.data(), - cuewErrorString(result))); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); -} - -int2 CUDASplitKernel::split_kernel_local_size() -{ - return make_int2(32, 1); -} - -int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) -{ - CUDAContextScope scope(device); - size_t free; - size_t total; - - cuda_assert(cuMemGetInfo(&free, &total)); - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) - << " bytes. 
(" << string_human_readable_size(free) << ")."; - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp new file mode 100644 index 00000000000..37fab8f8293 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -0,0 +1,1370 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/cuda/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize CUDA. 
*/ + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); + return; + } + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); + return; + } + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. 
*/ + cuCtxPopCurrent(NULL); +} + +CUDADevice::~CUDADevice() +{ + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); +} + +bool CUDADevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); + return false; + } + + return true; +} + +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + + return true; +} + +bool 
CUDADevice::use_adaptive_compilation() +{ + return DebugFlags().cuda.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } + +# ifdef WITH_NANOVDB + cflags += " -DWITH_NANOVDB"; +# endif + + return cflags; +} + +string CUDADevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string cubin_file = string_printf( + "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + set_error( + "CUDA nvcc compiler not found. 
" + "Install CUDA toolkit in default location."); + return string(); + } + + const int nvcc_cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; + if (nvcc_cuda_version < 101) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 10.1 or newer.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + return string(); + } + else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || + nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 to 11.4 are officially supported.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + } + + double starttime = time_dt(); + + path_create_directories(cubin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cu", name)))); + + string command = string_printf( + "\"%s\" " + "-arch=%s_%d%d " + "--%s \"%s\" " + "-o \"%s\" " + "%s", + nvcc, + kernel_arch, + major, + minor, + kernel_ext, + source_path.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + set_error( + "CUDA kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; +} + +bool CUDADevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string cubin = compile_kernel(kernel_features, kernel_name); + if (cubin.empty()) + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (result != CUDA_SUCCESS) + set_error(string_printf( + "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); + + if (result == CUDA_SUCCESS) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == CUDA_SUCCESS); +} + +void CUDADevice::reserve_local_memory(const uint /* kernel_features */) +{ + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + CUDADeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } +# endif +} + +void CUDADevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void CUDADevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void CUDADevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(cuda_mem_map_mutex); + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + + assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || + (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + CUDAMem *cmem = &cuda_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void CUDADevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void CUDADevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + cuMemFreeHost(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuda_assert(cuMemFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } +} + +void CUDADevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void CUDADevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + 
} + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void CUDADevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void CUDADevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CUDADevice::const_copy_to(const char *name, void *host, size_t size) +{ + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + // assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); +} + +void CUDADevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void CUDADevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void CUDADevice::tex_alloc(device_texture &mem) +{ + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (CUarray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. 
*/ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void CUDADevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void CUDADevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert( + cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void CUDADevice::thread_run(DeviceTask &task) +{ + CUDAContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create() +{ + return make_unique<CUDADeviceQueue>(this); +} + +bool CUDADevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using CUDA device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + CUDAContextScope scope(this); + + int num_all_devices = 0; + cuda_assert(cuDeviceGetCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<CUdevice> gl_devices(num_all_devices); + uint num_gl_devices; + cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL); + + for (CUdevice gl_device : gl_devices) { + if (gl_device == cuDevice) { + return true; + } + } + + return false; +} + +int CUDADevice::get_num_multiprocessors() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); +} + +int CUDADevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0); +} + +bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value) +{ + CUDAContextScope scope(this); + + return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS; +} + +int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h new file mode 100644 index 00000000000..6b27db54ab4 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.h @@ -0,0 +1,155 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/queue.h" +# include "device/cuda/util.h" +# include "device/device.h" + +# include "util/util_map.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include "util/util_opengl.h" +# include <cuda.h> +# include <cudaGL.h> +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class CUDADevice : public Device { + + friend class CUDAContextScope; + + public: + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int cuDevId; + int cuDevArchitecture; + bool first_error; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + CUtexObject texobject; + CUarray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + thread_mutex cuda_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUDADeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~CUDADevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "cuda", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + virtual bool 
should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(CUdevice_attribute attribute, int *value); + int get_device_default_attribute(CUdevice_attribute attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp new file mode 100644 index 00000000000..e8ca8b90eae --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/graphics_interop.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue) + : queue_(queue), device_(static_cast<CUDADevice *>(queue->device)) +{ +} + +CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop() +{ + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } +} + +void CUDADeviceGraphicsInterop::set_destination( + const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + need_clear_ = destination.need_clear; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } + + const CUresult result = cuGraphicsGLRegisterBuffer( + &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr CUDADeviceGraphicsInterop::map() +{ + if (!cu_graphics_resource_) { + return 0; + } + + CUDAContextScope scope(device_); + + CUdeviceptr cu_buffer; + size_t bytes; + + cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream())); + cuda_device_assert( + device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_)); + + if (need_clear_) { + cuda_device_assert( + device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream())); + + need_clear_ = false; + } + + return 
static_cast<device_ptr>(cu_buffer); +} + +void CUDADeviceGraphicsInterop::unmap() +{ + CUDAContextScope scope(device_); + + cuda_device_assert(device_, + cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h new file mode 100644 index 00000000000..8a70c8aa71d --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.h @@ -0,0 +1,66 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/device_graphics_interop.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class CUDADeviceQueue; + +class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue); + + CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete; + + ~CUDADeviceGraphicsInterop(); + + CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + CUDADeviceQueue *queue_ = nullptr; + CUDADevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. */ + int64_t buffer_area_ = 0; + + /* The destination was requested to be cleared. */ + bool need_clear_ = false; + + CUgraphicsResource cu_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp new file mode 100644 index 00000000000..0ed20ddf8e6 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void CUDADeviceKernels::load(CUDADevice *device) +{ + CUmodule cuModule = device->cuModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + CUDADeviceKernel &kernel = kernels_[i]; + + /* No megakernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + cuda_device_assert(device, + cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str())); + + if (kernel.function) { + cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1)); + + cuda_device_assert( + device, + cuOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool CUDADeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA*/ diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h new file mode 100644 index 00000000000..b489547a350 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* CUDA kernel and associate occupancy information. */ +class CUDADeviceKernel { + public: + CUfunction function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of CUDA kernels for each DeviceKernel. */ +class CUDADeviceKernels { + public: + void load(CUDADevice *device); + const CUDADeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp new file mode 100644 index 00000000000..b7f86c10553 --- /dev/null +++ b/intern/cycles/device/cuda/queue.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/queue.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/graphics_interop.h" +# include "device/cuda/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device) + : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr) +{ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING)); +} + +CUDADeviceQueue::~CUDADeviceQueue() +{ + const CUDAContextScope scope(cuda_device_); + cuStreamDestroy(cuda_stream_); +} + +int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states = max(cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor() * 16, + 1048576); + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + num_states = max((int)(num_states * atof(factor_str)), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int CUDADeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void CUDADeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. 
*/ + CUDAContextScope scope(cuda_device_); + cuda_device_->load_texture_info(); + cuda_device_assert(cuda_device_, cuCtxSynchronize()); + + debug_init_execution(); +} + +bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return cuda_device_->kernels.available(kernel); +} + +bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = cuda_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + + default: + break; + } + + /* Launch kernel. 
*/ + cuda_device_assert(cuda_device_, + cuLaunchKernel(cuda_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + cuda_stream_, + args, + 0)); + + return !(cuda_device_->have_error()); +} + +bool CUDADeviceQueue::synchronize() +{ + if (cuda_device_->have_error()) { + return false; + } + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + debug_synchronize(); + + return !(cuda_device_->have_error()); +} + +void CUDADeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync( + (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. 
*/ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyDtoHAsync( + mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_)); +} + +unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create() +{ + return make_unique<CUDADeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h new file mode 100644 index 00000000000..62e3aa3d6c2 --- /dev/null +++ b/intern/cycles/device/cuda/queue.h @@ -0,0 +1,67 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class device_memory; + +/* Base class for CUDA queues. 
*/ +class CUDADeviceQueue : public DeviceQueue { + public: + CUDADeviceQueue(CUDADevice *device); + ~CUDADeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual CUstream stream() + { + return cuda_stream_; + } + + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + CUDADevice *cuda_device_; + CUstream cuda_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp new file mode 100644 index 00000000000..8f657cc10fe --- /dev/null +++ b/intern/cycles/device/cuda/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/util.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) +{ + cuda_device_assert(device, cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_device_assert(device, cuCtxPopCurrent(NULL)); +} + +# ifndef WITH_CUDA_DYNLOAD +const char *cuewErrorString(CUresult result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *cuewCompilerPath() +{ + return CYCLES_CUDA_NVCC_EXECUTABLE; +} + +int cuewCompilerVersion() +{ + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h new file mode 100644 index 00000000000..a0898094c08 --- /dev/null +++ b/intern/cycles/device/cuda/util.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_CUDA + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + + private: + CUDADevice *device; +}; + +/* Utility for checking return values of CUDA function calls. */ +# define cuda_device_assert(cuda_device, stmt) \ + { \ + CUresult result = stmt; \ + if (result != CUDA_SUCCESS) { \ + const char *name = cuewErrorString(result); \ + cuda_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define cuda_assert(stmt) cuda_device_assert(this, stmt) + +# ifndef WITH_CUDA_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked CUDA at all. */ +const char *cuewErrorString(CUresult result); +const char *cuewCompilerPath(); +int cuewCompilerVersion(); +# endif /* WITH_CUDA_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ed53fbb54ae..6ccedcf54ef 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -20,7 +20,13 @@ #include "bvh/bvh2.h" #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" + +#include "device/cpu/device.h" +#include "device/cuda/device.h" +#include "device/dummy/device.h" +#include "device/multi/device.h" +#include "device/optix/device.h" #include "util/util_foreach.h" #include "util/util_half.h" @@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN bool Device::need_types_update = true; bool Device::need_devices_update = true; thread_mutex Device::device_mutex; -vector<DeviceInfo> Device::opencl_devices; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; 
vector<DeviceInfo> Device::cpu_devices; -vector<DeviceInfo> Device::network_devices; uint Device::devices_initialized_mask = 0; -/* Device Requested Features */ - -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) -{ - os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. */ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) - << std::endl; - os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) - << std::endl; - os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) - << std::endl; - os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) - << std::endl; - os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) - << std::endl; - os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) - << std::endl; - os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) - << std::endl; - os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) - << std::endl; - return os; -} - /* Device */ Device::~Device() noexcept(false) { - if (!background) { - if (vertex_buffer != 
0) { - glDeleteBuffers(1, &vertex_buffer); - } - if (fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } -} - -/* TODO move shaders to standalone .glsl file. */ -const char *FALLBACK_VERTEX_SHADER = - "#version 330\n" - "uniform vec2 fullscreen;\n" - "in vec2 texCoord;\n" - "in vec2 pos;\n" - "out vec2 texCoord_interp;\n" - "\n" - "vec2 normalize_coordinates()\n" - "{\n" - " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" - " texCoord_interp = texCoord;\n" - "}\n\0"; - -const char *FALLBACK_FRAGMENT_SHADER = - "#version 330\n" - "uniform sampler2D image_texture;\n" - "in vec2 texCoord_interp;\n" - "out vec4 fragColor;\n" - "\n" - "void main()\n" - "{\n" - " fragColor = texture(image_texture, texCoord_interp);\n" - "}\n\0"; - -static void shader_print_errors(const char *task, const char *log, const char *code) -{ - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while (getline(stream, partial, '\n')) { - if (line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; -} - -static int bind_fallback_shader(void) -{ - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; - - struct Shader { - const char *source; - GLenum type; - } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - - program = glCreateProgram(); - - for (int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); - - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); - - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); - - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - - if (!status) { - 
glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } - - glAttachShader(program, shader); - } - - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); - - /* Link and error check. */ - glLinkProgram(program); - - glGetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } - - return program; -} - -bool Device::bind_fallback_display_space_shader(const float width, const float height) -{ - if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if (fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; -} - -void Device::draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if (rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered - */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture 
coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if (vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if (transparent) { - glDisable(GL_BLEND); - } } void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) @@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) } } -Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { #ifdef WITH_MULTI if (!info.multi_devices.empty()) { /* Always create a multi device when info contains multiple 
devices. * This is done so that the type can still be e.g. DEVICE_CPU to indicate * that it is a homogeneous collection of devices, which simplifies checks. */ - return device_multi_create(info, stats, profiler, background); + return device_multi_create(info, stats, profiler); } #endif @@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool switch (info.type) { case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); + device = device_cpu_create(info, stats, profiler); break; #ifdef WITH_CUDA case DEVICE_CUDA: if (device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); + device = device_cuda_create(info, stats, profiler); break; #endif #ifdef WITH_OPTIX case DEVICE_OPTIX: if (device_optix_init()) - device = device_optix_create(info, stats, profiler, background); - break; -#endif -#ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = device_network_create(info, stats, profiler, "127.0.0.1"); - break; -#endif -#ifdef WITH_OPENCL - case DEVICE_OPENCL: - if (device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); + device = device_optix_create(info, stats, profiler); break; #endif default: @@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } if (device == NULL) { - device = device_dummy_create(info, stats, profiler, background); + device = device_dummy_create(info, stats, profiler); } return device; @@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CUDA; else if (strcmp(name, "OPTIX") == 0) return DEVICE_OPTIX; - else if (strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if (strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; @@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type) return "CUDA"; else if (type == DEVICE_OPTIX) return "OPTIX"; - else if (type == 
DEVICE_OPENCL) - return "OPENCL"; - else if (type == DEVICE_NETWORK) - return "NETWORK"; else if (type == DEVICE_MULTI) return "MULTI"; @@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif -#ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); -#endif -#ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); -#endif return types; } @@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) thread_scoped_lock lock(device_mutex); vector<DeviceInfo> devices; -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if (device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach (DeviceInfo &info, opencl_devices) { - devices.push_back(info); - } - } -#endif - #if defined(WITH_CUDA) || defined(WITH_OPTIX) if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) { if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) { @@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) } } -#ifdef WITH_NETWORK - if (mask & DEVICE_MASK_NETWORK) { - if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach (DeviceInfo &info, network_devices) { - devices.push_back(info); - } - } -#endif - return devices; } @@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask) capabilities += device_cpu_capabilities() + "\n"; } -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } -#endif - #ifdef WITH_CUDA if (mask & DEVICE_MASK_CUDA) { if (device_cuda_init()) { @@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } DeviceInfo info; - info.type = subdevices.front().type; + info.type = 
DEVICE_NONE; info.id = "MULTI"; info.description = "Multi Device"; info.num = 0; info.has_half_images = true; info.has_nanovdb = true; - info.has_volume_decoupled = true; - info.has_branched_path = true; - info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; @@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.id += device.id; /* Set device type to MULTI if subdevices are not of a common type. */ - if (device.type != info.type) { + if (info.type == DEVICE_NONE) { + info.type = device.type; + } + else if (device.type != info.type) { info.type = DEVICE_MULTI; } /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_branched_path &= device.has_branched_path; - info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; @@ -689,60 +315,32 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); - opencl_devices.free_memory(); cpu_devices.free_memory(); - network_devices.free_memory(); } -/* DeviceInfo */ - -void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +unique_ptr<DeviceQueue> Device::gpu_queue_create() { - assert(denoising_devices.empty()); - - if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { - vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); - if (!optix_devices.empty()) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } - - /* Try to use the same physical devices for denoising. 
*/ - for (const DeviceInfo &cuda_device : multi_devices) { - if (cuda_device.type == DEVICE_CUDA) { - for (const DeviceInfo &optix_device : optix_devices) { - if (cuda_device.num == optix_device.num) { - id += optix_device.id; - denoising_devices.push_back(optix_device); - break; - } - } - } - } - - if (denoising_devices.empty()) { - /* Simply use the first available OptiX device. */ - const DeviceInfo optix_device = optix_devices.front(); - id += optix_device.id; /* Uniquely identify this special multi device. */ - denoising_devices.push_back(optix_device); - } + LOG(FATAL) << "Device does not support queues."; + return nullptr; +} - denoisers = denoiser_type; - } - } - else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } +const CPUKernels *Device::get_cpu_kernels() const +{ + LOG(FATAL) << "Device does not support CPU kernels."; + return nullptr; +} - /* Add CPU denoising devices. 
*/ - DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); - denoising_devices.push_back(cpu_device); +void Device::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/) +{ + LOG(FATAL) << "Device does not support CPU kernels."; +} - denoisers = denoiser_type; - } +void *Device::get_cpu_osl_memory() +{ + return nullptr; } +/* DeviceInfo */ + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ecf79bcdfa6..02b6edb56d0 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -21,31 +21,34 @@ #include "bvh/bvh_params.h" +#include "device/device_denoise.h" #include "device/device_memory.h" -#include "device/device_task.h" +#include "util/util_function.h" #include "util/util_list.h" +#include "util/util_logging.h" #include "util/util_stats.h" #include "util/util_string.h" #include "util/util_texture.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN class BVH; +class DeviceQueue; class Progress; -class RenderTile; +class CPUKernels; +class CPUKernelThreadGlobals; /* Device Types */ enum DeviceType { DEVICE_NONE = 0, DEVICE_CPU, - DEVICE_OPENCL, DEVICE_CUDA, - DEVICE_NETWORK, DEVICE_MULTI, DEVICE_OPTIX, DEVICE_DUMMY, @@ -53,20 +56,11 @@ enum DeviceType { enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; -enum DeviceKernelStatus { - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, -}; - #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class DeviceInfo { @@ -75,20 +69,16 @@ class DeviceInfo { string description; string id; /* used for user 
preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_branched_path; /* Supports branched path tracing. */ - bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ - DenoiserTypeMask denoisers; /* Supported denoiser types. */ + bool display_device; /* GPU is used as a display device. */ + bool has_nanovdb; /* Support NanoVDB volumes. */ + bool has_half_images; /* Support half-float textures. */ + bool has_osl; /* Support Open Shading Language. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + bool has_gpu_queue; /* Device supports GPU queue. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector<DeviceInfo> multi_devices; - vector<DeviceInfo> denoising_devices; string error_msg; DeviceInfo() @@ -100,227 +90,35 @@ class DeviceInfo { display_device = false; has_half_images = false; has_nanovdb = false; - has_volume_decoupled = false; - has_branched_path = true; - has_adaptive_stop_per_sample = false; has_osl = false; - use_split_kernel = false; has_profiling = false; has_peer_memory = false; + has_gpu_queue = false; denoisers = DENOISER_NONE; } - bool operator==(const DeviceInfo &info) + bool operator==(const DeviceInfo &info) const { /* Multiple Devices with the same ID would be very bad. 
*/ assert(id != info.id || (type == info.type && num == info.num && description == info.description)); return id == info.id; } - - /* Add additional devices needed for the specified denoiser. */ - void add_denoising_devices(DenoiserType denoiser_type); -}; - -class DeviceRequestedFeatures { - public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_hair_thick; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. */ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. 
*/ - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_hair_thick = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures &requested_features) - { - return !(max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_hair_thick == requested_features.use_hair_thick && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to compilers. 
- */ - string get_build_options() const - { - string build_options = ""; - if (experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); - if (!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if (!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if (!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if (!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if (!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if (!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if (!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if (!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if (!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if (!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if (!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if (!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if (!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } }; -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); - /* Device */ -struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; -}; - class Device { friend class device_sub_ptr; protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) - : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), - fallback_shader_program(0), - info(info_), - stats(stats_), - profiler(profiler_) + 
Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : info(info_), stats(stats_), profiler(profiler_) { } - bool background; string error_msg; - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) { /* Only required for devices that implement denoising. */ @@ -361,67 +159,31 @@ class Device { Stats &stats; Profiler &profiler; - /* memory alignment */ - virtual int mem_sub_ptr_alignment() - { - return MIN_ALIGNMENT_CPU_DATA_TYPES; - } - /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - /* open shading language, only for CPU device */ - virtual void *osl_memory() - { - return NULL; - } - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + virtual bool load_kernels(uint /*kernel_features*/) { return true; } - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) - { - return true; - } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } + /* GPU device only functions. + * These may not be used on CPU or multi-devices. 
*/ - /* tasks */ - virtual int get_split_task_count(DeviceTask &) - { - return 1; - } + /* Create new queue for executing kernels in. */ + virtual unique_ptr<DeviceQueue> gpu_queue_create(); + + /* CPU device only functions. + * These may not be used on GPU or multi-devices. */ - virtual void task_add(DeviceTask &task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params); + /* Get CPU kernel functions for native instruction set. */ + virtual const CPUKernels *get_cpu_kernels() const; + /* Get kernel globals to pass to kernels. */ + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/); + /* Get OpenShadingLanguage memory buffer. */ + virtual void *get_cpu_osl_memory(); /* acceleration structure building */ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit); @@ -429,25 +191,11 @@ class Device { /* OptiX specific destructor. */ virtual void release_optix_bvh(BVH * /*bvh*/){}; -#ifdef WITH_NETWORK - /* networking */ - void server_run(); -#endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) - { - } virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { @@ -460,11 +208,47 @@ class Device { return false; } + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. 
*/ + + /* Check display si to be updated using graphics interoperability. + * The interoperability can not be used is it is not supported by the device. But the device + * might also force disable the interoperability if it detects that it will be slower than + * copying pixels from the render buffer. */ + virtual bool should_use_graphics_interop() + { + return false; + } + + /* Buffer denoising. */ + + /* Returns true if task is fully handled. */ + virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/) + { + LOG(ERROR) << "Request buffer denoising from a device which does not support it."; + return false; + } + + virtual DeviceQueue *get_denoise_queue() + { + LOG(ERROR) << "Request denoising queue from a device which does not support it."; + return nullptr; + } + + /* Sub-devices */ + + /* Run given callback for every individual device which will be handling rendering. + * For the single device the callback is called for the device itself. For the multi-device the + * callback is only called for the sub-devices. 
*/ + virtual void foreach_device(const function<void(Device *)> &callback) + { + callback(this); + } + /* static */ - static Device *create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background = true); + static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler); static DeviceType type_from_string(const char *name); static string string_from_type(DeviceType type); @@ -499,9 +283,7 @@ class Device { static thread_mutex device_mutex; static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; - static vector<DeviceInfo> opencl_devices; static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp deleted file mode 100644 index 4a6e77d6eaa..00000000000 --- a/intern/cycles/device/device_cpu.cpp +++ /dev/null @@ -1,1680 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -/* So ImathMath is included before our kernel_cpu_compat. 
*/ -#ifdef WITH_OSL -/* So no context pollution happens from indirectly included windows.h */ -# include "util/util_windows.h" -# include <OSL/oslexec.h> -#endif - -#ifdef WITH_EMBREE -# include <embree3/rtcore.h> -#endif - -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_intern.h" -#include "device/device_split_kernel.h" - -// clang-format off -#include "kernel/kernel.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_adaptive_sampling.h" - -#include "kernel/filter/filter.h" - -#include "kernel/osl/osl_shader.h" -#include "kernel/osl/osl_globals.h" -// clang-format on - -#include "bvh/bvh_embree.h" - -#include "render/buffers.h" -#include "render/coverage.h" - -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_function.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_opengl.h" -#include "util/util_openimagedenoise.h" -#include "util/util_optimization.h" -#include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_thread.h" - -CCL_NAMESPACE_BEGIN - -class CPUDevice; - -/* Has to be outside of the class to be shared across template instantiations. */ -static const char *logged_architecture = ""; - -template<typename F> class KernelFunctions { - public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions( - F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. 
*/ - (void)kernel_sse2; - (void)kernel_sse3; - (void)kernel_sse41; - (void)kernel_avx; - (void)kernel_avx2; -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } -#else - { - /* Dummy to prevent the architecture if below become - * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - * is not defined. 
*/ - } -#endif - - if (strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const - { - assert(kernel); - return kernel; - } - - protected: - F kernel; -}; - -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; - - public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); -}; - -class CPUDevice : public Device { - public: - TaskPool task_pool; - KernelGlobals kernel_globals; - - device_vector<TextureInfo> texture_info; - bool need_texture_info; - -#ifdef WITH_OSL - OSLGlobals osl_globals; -#endif -#ifdef WITH_OPENIMAGEDENOISE - oidn::DeviceRef oidn_device; - oidn::FilterRef oidn_filter; -#endif - thread_spin_lock oidn_task_lock; -#ifdef WITH_EMBREE - RTCScene embree_scene = NULL; - RTCDevice embree_device; -#endif - - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - convert_to_half_float_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - 
convert_to_byte_kernel; - KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> - shader_kernel; - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel; - - KernelFunctions<void (*)( - int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> - filter_divide_shadow_kernel; - KernelFunctions<void (*)( - int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> - filter_get_feature_kernel; - KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> - filter_write_feature_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_detect_outliers_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_combine_halves_kernel; - - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> - filter_nlm_calc_difference_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, float *, int *, int, int, int)> - filter_nlm_update_output_kernel; - KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void (*)( - float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> - filter_construct_transform_kernel; - KernelFunctions<void (*)(int, - int, - int, - float *, - float *, - float *, - int *, - float *, - float3 *, - int *, - int *, - int, - int, - int, - int, - bool)> - filter_nlm_construct_gramian_kernel; - KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> - filter_finalize_kernel; - - KernelFunctions<void (*)(KernelGlobals *, - ccl_constant KernelData *, - ccl_global void *, 
- int, - ccl_global char *, - int, - int, - int, - int, - int, - int, - int, - int, - ccl_global int *, - int, - ccl_global char *, - ccl_global unsigned int *, - unsigned int, - ccl_global float *)> - data_init_kernel; - unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; - -#define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_GLOBAL), -#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(bake), - REGISTER_KERNEL(filter_divide_shadow), - REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) -#undef REGISTER_KERNEL - { - if (info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } - -#ifdef WITH_OSL - kernel_globals.osl = &osl_globals; -#endif -#ifdef WITH_EMBREE - embree_device = rtcNewDevice("verbose=0"); -#endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if (use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define 
REGISTER_SPLIT_KERNEL(name) \ - split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ - KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); - REGISTER_SPLIT_KERNEL(adaptive_stopping); - REGISTER_SPLIT_KERNEL(adaptive_filter_x); - REGISTER_SPLIT_KERNEL(adaptive_filter_y); - REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); -#undef REGISTER_SPLIT_KERNEL -#undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { -#ifdef WITH_EMBREE - rtcReleaseDevice(embree_device); -#endif - task_pool.cancel(); - texture_info.free(); - } - - virtual bool show_samples() const override - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; -#ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - virtual void mem_alloc(device_memory &mem) override - { - if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else 
{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - virtual void mem_copy_to(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - virtual void mem_copy_from( - device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override - { - /* no-op */ - } - - virtual void mem_zero(device_memory &mem) override - { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - if (mem.device_pointer) { - memset((void *)mem.device_pointer, 0, mem.memory_size()); - } - } - - virtual void mem_free(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else if (mem.device_pointer) { - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - util_aligned_free((void *)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override - { - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); - } - - virtual void 
const_copy_to(const char *name, void *host, size_t size) override - { -#if WITH_EMBREE - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update scene handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; - } -#endif - kernel_const_copy(&kernel_globals, name, host, size); - } - - void global_alloc(device_memory &mem) - { - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void global_free(device_memory &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void tex_alloc(device_texture &mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ - texture_info.resize(slot + 128); - } - - texture_info[slot] = mem.info; - texture_info[slot].data = (uint64_t)mem.host_pointer; - need_texture_info = true; - } - - void tex_free(device_texture &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - virtual void *osl_memory() override - { -#ifdef WITH_OSL - return &osl_globals; -#else - return NULL; -#endif - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { -#ifdef WITH_EMBREE - if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { - BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); - if (refit) { - bvh_embree->refit(progress); - } - else { - bvh_embree->build(progress, &stats, embree_device); - } - - if (bvh->params.top_level) { - embree_scene = bvh_embree->scene; - } - } - else -#endif - Device::build_bvh(bvh, progress, refit); - } - - void thread_run(DeviceTask &task) - { - if (task.type == DeviceTask::RENDER) - thread_render(task); - else if (task.type == DeviceTask::SHADER) - thread_shader(task); - else if (task.type == DeviceTask::FILM_CONVERT) - thread_film_convert(task); - else if (task.type == DeviceTask::DENOISE_BUFFER) - thread_denoise(task); - } - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z - rect.x, 4); - int h = rect.w - rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float) * w * h); - memset((float *)out_ptr, 0, sizeof(float) * w * h); - - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = { - max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)guide_ptr, - (float *)variance_ptr, - NULL, - difference, - local_rect, - w, - channel_offset, - 0, - a, - k_2); - - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, - dy, - blurDifference, - (float *)image_ptr, - difference, - (float *)out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, - f); - } - - int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; - filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, - y * task->filter_area.z + x, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - 
task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = {max(0, -dx), - max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)color_ptr, - (float *)color_variance_ptr, - (float *)scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()( - blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, - dy, - task->tile_info->frames[frame], - blurDifference, - (float *)task->buffer.mem.device_pointer, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) - { - for (int y = 0; 
y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y * task->filter_area.z + x, - (float *)output_ptr, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - (float *)a_ptr, - (float *)b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, - y, - (float *)a_ptr, - (float *)b_ptr, - (float *)sample_variance_ptr, - (float *)sv_variance_ptr, - (float *)buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - 
filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float *)from_ptr, - (float *)buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, - y, - (float *)image_ptr, - (float *)variance_ptr, - (float *)depth_ptr, - (float *)output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) - { - WorkTile wtile; - wtile.x = tile.x; - wtile.y = tile.y; - wtile.w = tile.w; - wtile.h = tile.h; - wtile.offset = tile.offset; - wtile.stride = tile.stride; - wtile.buffer = (float *)tile.buffer; - - /* For CPU we do adaptive stopping per sample so we can stop earlier, but - * for combined CPU + GPU rendering we match the GPU and do it per tile - * after a given number of sample steps. 
*/ - if (!kernel_data.integrator.adaptive_stop_per_sample) { - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - const int index = wtile.offset + x + y * wtile.stride; - float *buffer = wtile.buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } - } - - bool any = false; - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - any |= kernel_do_adaptive_filter_x(kg, y, &wtile); - } - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - any |= kernel_do_adaptive_filter_y(kg, x, &wtile); - } - return (!any); - } - - void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg) - { - float *render_buffer = (float *)tile.buffer; - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - int index = tile.offset + x + y * tile.stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f)); - } - } - } - } - - void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if (use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float *)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel() || TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - - if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { - tile.stealing_state = RenderTile::WAS_STOLEN; - break; - } - - if (tile.task == RenderTile::PATH_TRACE) { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - else { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - tile.sample = sample + 1; - - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile, sample); - if (stop) { - const int num_progress_samples = end_sample - sample; - tile.sample = end_sample; - task.update_progress(&tile, tile.w * tile.h * num_progress_samples); - break; - } - } - - task.update_progress(&tile, tile.w * tile.h); - } - if (use_coverage) { - coverage.finalize(); - } - - if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { - adaptive_sampling_post(tile, kg); - } - } - - void denoise_openimagedenoise_buffer(DeviceTask &task, - float *buffer, - const size_t offset, - const size_t stride, - const size_t x, - const size_t y, - const size_t w, - const size_t h, - const float scale) - { -#ifdef WITH_OPENIMAGEDENOISE - assert(openimagedenoise_supported()); - - /* Only one at a time, since OpenImageDenoise itself is multithreaded for full - * buffers, and for tiled rendering because creating multiple devices and filters - * is slow and memory hungry as well. 
- * - * TODO: optimize tiled rendering case, by batching together denoising of many - * tiles somehow? */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - /* Create device and filter, cached for reuse. */ - if (!oidn_device) { - oidn_device = oidn::newDevice(); - oidn_device.commit(); - } - if (!oidn_filter) { - oidn_filter = oidn_device.newFilter("RT"); - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); - } - - /* Set images with appropriate stride for our interleaved pass storage. */ - struct { - const char *name; - const int offset; - const bool scale; - const bool use; - array<float> scaled_buffer; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, - {"albedo", - task.pass_denoising_data + DENOISING_PASS_ALBEDO, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, - {"normal", - task.pass_denoising_data + DENOISING_PASS_NORMAL, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, - {"output", 0, false, true}, - { NULL, - 0 }}; - - for (int i = 0; passes[i].name; i++) { - if (!passes[i].use) { - continue; - } - - const int64_t pixel_offset = offset + x + y * stride; - const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); - const int64_t pixel_stride = task.pass_stride; - const int64_t row_stride = stride * pixel_stride; - - if (passes[i].scale && scale != 1.0f) { - /* Normalize albedo and normal passes as they are scaled by the number of samples. - * For the color passes OIDN will perform auto-exposure making it unnecessary. 
*/ - array<float> &scaled_buffer = passes[i].scaled_buffer; - scaled_buffer.resize(w * h * 3); - - for (int y = 0; y < h; y++) { - const float *pass_row = buffer + buffer_offset + y * row_stride; - float *scaled_row = scaled_buffer.data() + y * w * 3; - - for (int x = 0; x < w; x++) { - scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; - scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; - scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; - } - } - - oidn_filter.setImage( - passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); - } - else { - oidn_filter.setImage(passes[i].name, - buffer + buffer_offset, - oidn::Format::Float3, - w, - h, - 0, - pixel_stride * sizeof(float), - row_stride * sizeof(float)); - } - } - - /* Execute filter. */ - oidn_filter.commit(); - oidn_filter.execute(); -#else - (void)task; - (void)buffer; - (void)offset; - (void)stride; - (void)x; - (void)y; - (void)w; - (void)h; - (void)scale; -#endif - } - - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) - { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - - denoise_openimagedenoise_buffer(task, - (float *)rtile.buffer, - rtile.offset, - rtile.stride, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - 1.0f / rtile.sample); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); - } - else { - /* Per-tile denoising. */ - rtile.sample = rtile.start_sample + rtile.num_samples; - const float scale = 1.0f / rtile.sample; - const float invscale = rtile.sample; - const size_t pass_stride = task.pass_stride; - - /* Map neighboring tiles into one buffer for denoising. 
*/ - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - rtile = center_tile; - - /* Calculate size of the tile to denoise (including overlap). The overlap - * size was chosen empirically. OpenImageDenoise specifies an overlap size - * of 128 but this is significantly bigger than typical tile size. */ - const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); - const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - - /* Adjacent tiles are in separate memory regions, copy into single buffer. */ - array<float> merged(rect_size.x * rect_size.y * task.pass_stride); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &ntile = neighbors.tiles[i]; - if (!ntile.buffer) { - continue; - } - - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x++) { - merged_buffer[x] = tile_buffer[x] * scale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - /* Denoise */ - denoise_openimagedenoise_buffer( - task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); - - /* Copy back result from merged buffer. 
*/ - RenderTile &ntile = neighbors.target; - if (ntile.buffer) { - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - const float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { - tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; - tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; - tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - task.unmap_neighbor_tiles(neighbors, this); - } - } - - void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind( - &CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - 
denoising.functions.get_feature = function_bind( - &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(tile); - } - - void thread_render(DeviceTask &task) - { - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) - KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if (use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if (!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - /* NLM denoiser. */ - DenoisingTask *denoising = NULL; - - /* OpenImageDenoise: we can only denoise with one thread at a time, so to - * avoid waiting with mutex locks in the denoiser, we let only a single - * thread acquire denoising tiles. 
*/ - uint tile_types = task.tile_types; - bool hold_denoise_lock = false; - if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - if (!oidn_task_lock.try_lock()) { - tile_types &= ~RenderTile::DENOISE; - hold_denoise_lock = true; - } - } - - RenderTile tile; - while (task.acquire_tile(this, tile, tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, kgbuffer, void_buffer); - } - else { - render(task, tile, kg); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, kg); - } - else if (tile.task == RenderTile::DENOISE) { - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else if (task.denoising.type == DENOISER_NLM) { - if (denoising == NULL) { - denoising = new DenoisingTask(this, task); - denoising->profiler = &kg->profiler; - } - denoise_nlm(*denoising, tile); - } - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - } - - if (hold_denoise_lock) { - oidn_task_lock.unlock(); - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - delete denoising; - } - - void thread_denoise(DeviceTask &task) - { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else { - DenoisingTask denoising(this, task); - - 
ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; - - denoise_nlm(denoising, tile); - - profiler.remove_state(&denoising_profiler_state); - } - - task.update_progress(&tile, tile.w * tile.h); - } - - void thread_film_convert(DeviceTask &task) - { - float sample_scale = 1.0f / (task.sample + 1); - - if (task.rgba_half) { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, - (uchar4 *)task.rgba_half, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - else { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, - (uchar4 *)task.rgba_byte, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - } - - void thread_shader(DeviceTask &task) - { - KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init()); - - for (int sample = 0; sample < task.num_samples; sample++) { - for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(kg, - (uint4 *)task.shader_input, - (float4 *)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if (task.get_cancel() || TaskPool::canceled()) - break; - - task.update_progress(NULL); - } - - thread_kernel_globals_free(kg); - delete kg; - } - - virtual int get_split_task_count(DeviceTask &task) override - { - if (task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - virtual void task_add(DeviceTask &task) override - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if (task.type == DeviceTask::DENOISE_BUFFER && - task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - /* Denoise entire buffer at once with OIDN, it has own threading. */ - tasks.push_back(task); - } - else if (task.type == DeviceTask::SHADER) { - task.split(tasks, info.cpu_threads, 256); - } - else { - task.split(tasks, info.cpu_threads); - } - - foreach (DeviceTask &task, tasks) { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - } - - virtual void task_wait() override - { - task_pool.wait_work(); - } - - virtual void task_cancel() override - { - task_pool.cancel(); - } - - protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if (kg == NULL) { - return; - } - - if (kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - if (kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } -#ifdef WITH_OSL - OSLShader::thread_free(kg); -#endif - } - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override - { - requested_features = requested_features_; - - return true; - } -}; - -/* split kernel */ - -class 
CPUSplitKernelFunction : public SplitKernelFunction { - public: - CPUDevice *device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) - { - } - ~CPUSplitKernelFunction() - { - } - - virtual bool enqueue(const KernelDimensions &dim, - device_memory &kernel_globals, - device_memory &data) - { - if (!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData *)data.device_pointer); - } - } - - return true; - } -}; - -CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flags, - device_memory &work_pool_wgs) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, - (KernelData *)data.device_pointer, - (void *)split_data.device_pointer, - num_global_elements, - (char *)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int *)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char *)use_queues_flags.device_pointer, - (uint 
*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float *)rtile.buffer); - } - } - - return true; -} - -SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - - kernel->func = device->split_kernels[kernel_name](); - if (!kernel->func) { - delete kernel; - return NULL; - } - - return kernel; -} - -int2 CPUSplitKernel::split_kernel_local_size() -{ - return make_int2(1, 1); -} - -int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, - device_memory & /*data*/, - DeviceTask & /*task*/) -{ - return make_int2(1, 1); -} - -uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, - device_memory & /*data*/, - size_t num_threads) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - - return split_data_buffer_size(kg, num_threads); -} - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new CPUDevice(info, stats, profiler, background); -} - -void device_cpu_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_adaptive_stop_per_sample = true; - info.has_osl = true; - info.has_half_images = true; - info.has_nanovdb = true; - info.has_profiling = true; - info.denoisers = DENOISER_NLM; - if (openimagedenoise_supported()) { - info.denoisers |= DENOISER_OPENIMAGEDENOISE; - } - - devices.insert(devices.begin(), info); -} - -string device_cpu_capabilities() -{ - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? 
"AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if (capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp new file mode 100644 index 00000000000..aea7868f65d --- /dev/null +++ b/intern/cycles/device/device_denoise.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +const char *denoiserTypeToHumanReadable(DenoiserType type) +{ + switch (type) { + case DENOISER_OPTIX: + return "OptiX"; + case DENOISER_OPENIMAGEDENOISE: + return "OpenImageDenoise"; + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + return "UNKNOWN"; + } + + return "UNKNOWN"; +} + +const NodeEnum *DenoiseParams::get_type_enum() +{ + static NodeEnum type_enum; + + if (type_enum.empty()) { + type_enum.insert("optix", DENOISER_OPTIX); + type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE); + } + + return &type_enum; +} + +const NodeEnum *DenoiseParams::get_prefilter_enum() +{ + static NodeEnum prefilter_enum; + + if (prefilter_enum.empty()) { + prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + } + + return &prefilter_enum; +} + +NODE_DEFINE(DenoiseParams) +{ + NodeType *type = NodeType::add("denoise_params", create); + + const NodeEnum *type_enum = get_type_enum(); + const NodeEnum *prefilter_enum = get_prefilter_enum(); + + SOCKET_BOOLEAN(use, "Use", false); + + SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE); + + SOCKET_INT(start_sample, "Start Sample", 0); + + SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true); + SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false); + + SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST); + + return type; +} + +DenoiseParams::DenoiseParams() : Node(get_node_type()) +{ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h new file mode 100644 index 00000000000..02ee63fb0ad --- /dev/null +++ b/intern/cycles/device/device_denoise.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" +#include "graph/node.h" +#include "render/buffers.h" + +CCL_NAMESPACE_BEGIN + +enum DenoiserType { + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +/* COnstruct human-readable string which denotes the denoiser type. */ +const char *denoiserTypeToHumanReadable(DenoiserType type); + +typedef int DenoiserTypeMask; + +enum DenoiserPrefilter { + /* Best quality of the result without extra processing time, but requires guiding passes to be + * noise-free. */ + DENOISER_PREFILTER_NONE = 1, + + /* Denoise color and guiding passes together. + * Improves quality when guiding passes are noisy using least amount of extra processing time. */ + DENOISER_PREFILTER_FAST = 2, + + /* Prefilter noisy guiding passes before denoising color. + * Improves quality when guiding passes are noisy using extra processing time. */ + DENOISER_PREFILTER_ACCURATE = 3, + + DENOISER_PREFILTER_NUM, +}; + +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. + * The default values here do not really matter as they are always initialized from the + * Integrator node. */ +class DenoiseParams : public Node { + public: + NODE_DECLARE + + /* Apply denoiser to image. */ + bool use = false; + + /* Denoiser type. */ + DenoiserType type = DENOISER_OPENIMAGEDENOISE; + + /* Viewport start sample. */ + int start_sample = 0; + + /* Auxiliry passes. 
*/ + bool use_pass_albedo = true; + bool use_pass_normal = true; + + DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST; + + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_prefilter_enum(); + + DenoiseParams(); + + bool modified(const DenoiseParams &other) const + { + return !(use == other.use && type == other.type && start_sample == other.start_sample && + use_pass_albedo == other.use_pass_albedo && + use_pass_normal == other.use_pass_normal && prefilter == other.prefilter); + } +}; + +/* All the parameters needed to perform buffer denoising on a device. + * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is + * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a + * single place where they are all listed, so that it's not required to modify all device methods + * when these parameters do change. */ +class DeviceDenoiseTask { + public: + DenoiseParams params; + + int num_samples; + + RenderBuffers *render_buffers; + BufferParams buffer_params; + + /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will + * lower the memory footprint of the denoiser but will make input passes "invalid" (from path + * tracer) point of view. */ + bool allow_inplace_modification; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp deleted file mode 100644 index 38c42d15cab..00000000000 --- a/intern/cycles/device/device_denoising.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_denoising.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) - : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) -{ - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if (task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); - tile_info->from_render = task.denoising_from_render ? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for (int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i - 1]; - } - - do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; - do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; -} - -DenoisingTask::~DenoisingTask() -{ - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); -} - -void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) -{ - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &rtile = neighbors.tiles[i]; - tile_info->offsets[i] = rtile.offset; - tile_info->strides[i] = rtile.stride; - tile_info->buffers[i] = rtile.buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - - target_buffer.offset = neighbors.target.offset; - target_buffer.stride = neighbors.target.stride; - target_buffer.ptr = neighbors.target.buffer; - - if (do_prefilter && neighbors.target.buffers) { - target_buffer.denoising_output_offset = - neighbors.target.buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); -} - -void DenoisingTask::setup_denoising_buffer() -{ - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring - * tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = 
rect_clip(rect, - make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity ? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if (buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ - int max_radius = max(radius, 6); - int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); - num_layers = 2 * num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); -} - -void DenoisingTask::prefilter_shadowing() -{ - device_ptr null_ptr = (device_ptr)0; - - device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the - * sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the - * sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight - * calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); -} - -void DenoisingTask::prefilter_features() -{ - device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; - int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; - for (int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass( - buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], - variance_from[pass], - *unfiltered, - *variance, - 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } -} - -void DenoisingTask::prefilter_color() -{ - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = {8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(6 * buffer.pass_stride, false); - - for (int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass( - temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], - variance_from[pass], - *color_pass, - *color_var_pass, - 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass( - buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - functions.detect_outliers( - temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if (buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } -} - -void DenoisingTask::load_buffer() -{ - device_ptr null_ptr = (device_ptr)0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity ? 
15 : 14; - for (int i = 0; i < tile_info->num_frames; i++) { - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass( - buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature( - pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; -} - -void DenoisingTask::write_buffer() -{ - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity ? 15 : 14; - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } -} - -void DenoisingTask::construct_transform() -{ - storage.w = filter_area.z; - storage.h = filter_area.w; - - storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w * storage.h, false); - - functions.construct_transform(); -} - -void DenoisingTask::reconstruct() -{ - storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape( - filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - 
reconstruction_state.source_w = rect.z - rect.x; - reconstruction_state.source_h = rect.w - rect.y; - - device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); - for (int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); -} - -void DenoisingTask::run_denoising(RenderTile &tile) -{ - RenderTileNeighbors neighbors(tile); - functions.map_neighbor_tiles(neighbors); - set_render_buffer(neighbors); - - setup_denoising_buffer(); - - if (tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if (do_filter) { - construct_transform(); - reconstruct(); - } - - if (do_prefilter) { - write_buffer(); - } - - functions.unmap_neighbor_tiles(neighbors); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h deleted file mode 100644 index bb8bdfdd225..00000000000 --- a/intern/cycles/device/device_denoising.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_DENOISING_H__ -#define __DEVICE_DENOISING_H__ - -#include "device/device.h" - -#include "render/buffers.h" - -#include "kernel/filter/filter_defines.h" - -#include "util/util_profiling.h" - -CCL_NAMESPACE_BEGIN - -class DenoisingTask { - public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool do_prefilter; - bool do_filter; - - struct DeviceFunctions { - function<bool( - device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> - non_local_means; - function<bool( - device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> - accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect)> - combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr)> - divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale)> - get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr)> - detect_outliers; - function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; - function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) - { - r = r_; - f = f_; - a = a_, k_2 = k_2_; - is_color = is_color_; - } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - { - } - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile &tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem", true) - { - } - } buffer; - - protected: - Device *device; - - void set_render_buffer(RenderTileNeighbors &neighbors); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/device/device_graphics_interop.cpp index fa210e747c0..a80a236759f 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/device/device_graphics_interop.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,11 +14,8 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_path_init.h" +#include "device/device_graphics_interop.h" -#define KERNEL_NAME path_init -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +CCL_NAMESPACE_BEGIN +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h new file mode 100644 index 00000000000..671b1c189d7 --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Information about interoperability destination. + * Is provided by the GPUDisplay. */ +class DeviceGraphicsInteropDestination { + public: + /* Dimensions of the buffer, in pixels. */ + int buffer_width = 0; + int buffer_height = 0; + + /* OpenGL pixel buffer object. */ + int opengl_pbo_id = 0; + + /* Clear the entire destination before doing partial write to it. */ + bool need_clear = false; +}; + +/* Device-side graphics interoperability support. + * + * Takes care of holding all the handlers needed by the device to implement interoperability with + * the graphics library. 
*/ +class DeviceGraphicsInterop { + public: + DeviceGraphicsInterop() = default; + virtual ~DeviceGraphicsInterop() = default; + + /* Update this device-side graphics interoperability object with the given destination resource + * information. */ + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0; + + virtual device_ptr map() = 0; + virtual void unmap() = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h deleted file mode 100644 index ecc79c5d7ee..00000000000 --- a/intern/cycles/device/device_intern.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __DEVICE_INTERN_H__ -#define __DEVICE_INTERN_H__ - -#include "util/util_string.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class Device; -class DeviceInfo; -class Profiler; -class Stats; - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string> ¶meters); -bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_optix_init(); -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address); -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo> &devices); -void device_opencl_info(vector<DeviceInfo> &devices); -void device_cuda_info(vector<DeviceInfo> &devices); -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); -void device_network_info(vector<DeviceInfo> &devices); - -string device_cpu_capabilities(); -string device_opencl_capabilities(); -string device_cuda_capabilities(); - -CCL_NAMESPACE_END - -#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp new file mode 100644 index 00000000000..ceaddee4756 --- /dev/null +++ b/intern/cycles/device/device_kernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_kernel.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel) +{ + switch (kernel) { + /* Integrator. */ + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + return "integrator_init_from_camera"; + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + return "integrator_init_from_bake"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + return "integrator_intersect_closest"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + return "integrator_intersect_shadow"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + return "integrator_intersect_subsurface"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + return "integrator_intersect_volume_stack"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + return "integrator_shade_background"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + return "integrator_shade_light"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + return "integrator_shade_shadow"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + return "integrator_shade_surface"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + return "integrator_shade_surface_raytrace"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + return "integrator_shade_volume"; + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + return "integrator_megakernel"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + return "integrator_queued_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + return "integrator_queued_shadow_paths_array"; + case 
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + return "integrator_active_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + return "integrator_terminated_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + return "integrator_sorted_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + return "integrator_compact_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + return "integrator_compact_states"; + case DEVICE_KERNEL_INTEGRATOR_RESET: + return "integrator_reset"; + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + return "integrator_shadow_catcher_count_possible_splits"; + + /* Shader evaluation. */ + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + return "shader_eval_displace"; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + return "shader_eval_background"; + + /* Film. */ + +#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \ + case DEVICE_KERNEL_FILM_CONVERT_##variant: \ + return "film_convert_" #variant_lowercase; \ + case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \ + return "film_convert_" #variant_lowercase "_half_rgba"; + + FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth) + FILM_CONVERT_KERNEL_AS_STRING(MIST, mist) + FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float) + FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3) + FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion) + FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW, + shadow_catcher_matte_with_shadow) + FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4) + +#undef FILM_CONVERT_KERNEL_AS_STRING + + /* Adaptive sampling. 
*/ + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: + return "adaptive_sampling_convergence_check"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: + return "adaptive_sampling_filter_x"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: + return "adaptive_sampling_filter_y"; + + /* Denoising. */ + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: + return "filter_guiding_preprocess"; + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: + return "filter_guiding_set_fake_albedo"; + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: + return "filter_color_preprocess"; + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: + return "filter_color_postprocess"; + + /* Cryptomatte. */ + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: + return "cryptomatte_postprocess"; + + /* Generic */ + case DEVICE_KERNEL_PREFIX_SUM: + return "prefix_sum"; + + case DEVICE_KERNEL_NUM: + break; + }; + LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel) +{ + os << device_kernel_as_string(kernel); + return os; +} + +string device_kernel_mask_as_string(DeviceKernelMask mask) +{ + string str; + + for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) { + if (mask & (uint64_t(1) << i)) { + if (!str.empty()) { + str += " "; + } + str += device_kernel_as_string((DeviceKernel)i); + } + } + + return str; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/device/device_kernel.h index 9e1e57beba6..83d959ca87b 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/device/device_kernel.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -14,13 +14,20 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#pragma once -#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao -#define LOCALS_TYPE BackgroundAOLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "kernel/kernel_types.h" +#include "util/util_string.h" + +#include <ostream> // NOLINT + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel); +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); + +typedef uint64_t DeviceKernelMask; +string device_kernel_mask_as_string(DeviceKernelMask mask); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index 80a05fc32fe..c4d45829b83 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN device_memory::device_memory(Device *device, const char *name, MemoryType type) : data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), + data_elements(device_type_traits<uchar>::num_elements_cpu), data_size(0), device_size(0), data_width(0), @@ -149,6 +149,11 @@ void device_memory::device_zero() } } +bool device_memory::device_is_cpu() +{ + return (device->info.type == DEVICE_CPU); +} + void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 80f4d7b0468..c51594b8580 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -38,7 +38,6 @@ enum MemoryType { MEM_DEVICE_ONLY, MEM_GLOBAL, MEM_TEXTURE, - MEM_PIXELS }; /* Supported Data Types */ @@ -54,7 +53,7 @@ 
enum DataType { TYPE_UINT64, }; -static inline size_t datatype_size(DataType datatype) +static constexpr size_t datatype_size(DataType datatype) { switch (datatype) { case TYPE_UNKNOWN: @@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype) template<typename T> struct device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const int num_elements_cpu = sizeof(T); + static const int num_elements_gpu = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); }; template<> struct 
device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(int4) == num_elements_cpu * 
datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int 
num_elements_gpu = 1; + static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); }; /* Device Memory @@ -257,6 +299,8 @@ class device_memory { void device_copy_from(int y, int w, int h, int elem); void device_zero(); + bool device_is_cpu(); + device_ptr original_device_ptr; size_t original_device_size; Device *original_device; @@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory { : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); + data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu : + device_type_traits<T>::num_elements_gpu, + 1); } device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other)) @@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory { template<typename T> class device_vector : public device_memory { public: + /* Can only use this for types that have the same size on CPU and GPU. 
*/ + static_assert(device_type_traits<T>::num_elements_cpu == + device_type_traits<T>::num_elements_gpu); + device_vector(Device *device, const char *name, MemoryType type) : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; + data_elements = device_type_traits<T>::num_elements_cpu; modified = true; need_realloc_ = true; @@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory { return (T *)host_pointer; } + const T *data() const + { + return (T *)host_pointer; + } + T &operator[](size_t i) { assert(i < data_size); @@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory { void copy_from_device() { - device_copy_from(0, data_width, data_height, sizeof(T)); + device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T)); } void copy_from_device(int y, int w, int h) @@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory { } }; -/* Pixel Memory - * - * Device memory to efficiently draw as pixels to the screen in interactive - * rendering. Only copying pixels from the device is supported, not copying to. */ - -template<typename T> class device_pixels : public device_vector<T> { - public: - device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if (!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } -}; - /* Device Sub Memory * * Pointer into existing memory. 
It is not allocated separately, but created diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp deleted file mode 100644 index 85ffa5fcd52..00000000000 --- a/intern/cycles/device/device_multi.cpp +++ /dev/null @@ -1,826 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <sstream> -#include <stdlib.h> - -#include "bvh/bvh_multi.h" - -#include "device/device.h" -#include "device/device_intern.h" -#include "device/device_network.h" - -#include "render/buffers.h" -#include "render/geometry.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -class MultiDevice : public Device { - public: - struct SubDevice { - Stats stats; - Device *device; - map<device_ptr, device_ptr> ptr_map; - int peer_island_index = -1; - }; - - list<SubDevice> devices, denoising_devices; - device_ptr unique_key; - vector<vector<SubDevice *>> peer_islands; - bool use_denoising; - bool matching_rendering_and_denoising_devices; - - MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - unique_key(1), - use_denoising(!info.denoising_devices.empty()) - { - foreach (DeviceInfo &subinfo, info.multi_devices) { - /* Always add CPU devices at the back since GPU devices can change - * host 
memory pointers, which CPU uses as device pointer. */ - SubDevice *sub; - if (subinfo.type == DEVICE_CPU) { - devices.emplace_back(); - sub = &devices.back(); - } - else { - devices.emplace_front(); - sub = &devices.front(); - } - - /* The pointer to 'sub->stats' will stay valid even after new devices - * are added, since 'devices' is a linked list. */ - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - foreach (DeviceInfo &subinfo, info.denoising_devices) { - denoising_devices.emplace_front(); - SubDevice *sub = &denoising_devices.front(); - - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - /* Build a list of peer islands for the available render devices */ - foreach (SubDevice &sub, devices) { - /* First ensure that every device is in at least once peer island */ - if (sub.peer_island_index < 0) { - peer_islands.emplace_back(); - sub.peer_island_index = (int)peer_islands.size() - 1; - peer_islands[sub.peer_island_index].push_back(&sub); - } - - if (!info.has_peer_memory) { - continue; - } - - /* Second check peer access between devices and fill up the islands accordingly */ - foreach (SubDevice &peer_sub, devices) { - if (peer_sub.peer_island_index < 0 && - peer_sub.device->info.type == sub.device->info.type && - peer_sub.device->check_peer_access(sub.device)) { - peer_sub.peer_island_index = sub.peer_island_index; - peer_islands[sub.peer_island_index].push_back(&peer_sub); - } - } - } - - /* Try to re-use memory when denoising and render devices use the same physical devices - * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU). - * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. 
*/ - matching_rendering_and_denoising_devices = denoising_devices.empty() || - (devices.size() == denoising_devices.size()); - if (matching_rendering_and_denoising_devices) { - for (list<SubDevice>::iterator device_it = devices.begin(), - denoising_device_it = denoising_devices.begin(); - device_it != devices.end() && denoising_device_it != denoising_devices.end(); - ++device_it, ++denoising_device_it) { - const DeviceInfo &info = device_it->device->info; - const DeviceInfo &denoising_info = denoising_device_it->device->info; - if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) || - (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) || - info.num != denoising_info.num) { - matching_rendering_and_denoising_devices = false; - break; - } - } - } - -#ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); - - vector<string> servers = discovery.get_server_list(); - - foreach (string &server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if (device) - devices.push_back(SubDevice(device)); - } -#endif - } - - ~MultiDevice() - { - foreach (SubDevice &sub, devices) - delete sub.device; - foreach (SubDevice &sub, denoising_devices) - delete sub.device; - } - - const string &error_message() override - { - error_msg.clear(); - - foreach (SubDevice &sub, devices) - error_msg += sub.device->error_message(); - foreach (SubDevice &sub, denoising_devices) - error_msg += sub.device->error_message(); - - return error_msg; - } - - virtual bool show_samples() const override - { - if (devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; - foreach (const SubDevice &sub_device, devices) { - BVHLayoutMask device_bvh_layout_mask = 
sub_device.device->get_bvh_layout_mask(); - bvh_layout_mask &= device_bvh_layout_mask; - bvh_layout_mask_all |= device_bvh_layout_mask; - } - - /* With multiple OptiX devices, every device needs its own acceleration structure */ - if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { - return BVH_LAYOUT_MULTI_OPTIX; - } - - /* When devices do not share a common BVH layout, fall back to creating one for each */ - const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); - if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { - return BVH_LAYOUT_MULTI_OPTIX_EMBREE; - } - - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->load_kernels(requested_features)) - return false; - - use_denoising = requested_features.use_denoising; - if (requested_features.use_denoising) { - /* Only need denoising feature, everything else is unused. */ - DeviceRequestedFeatures denoising_features; - denoising_features.use_denoising = true; - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(denoising_features)) - return false; - } - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - - if (requested_features.use_denoising) { - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - } - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() override - { - DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach (SubDevice &sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case 
DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - - return result; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - /* Try to build and share a single acceleration structure, if possible */ - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { - devices.back().device->build_bvh(bvh, progress, refit); - return; - } - - assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); - - BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); - bvh_multi->sub_bvhs.resize(devices.size()); - - vector<BVHMulti *> geom_bvhs; - geom_bvhs.reserve(bvh->geometry.size()); - foreach (Geometry *geom, bvh->geometry) { - geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); - } - - /* Broadcast acceleration structure build to all render devices */ - size_t i = 0; - foreach (SubDevice &sub, devices) { - /* Change geometry BVH pointers to the sub BVH */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; - } - - if (!bvh_multi->sub_bvhs[i]) { - BVHParams params = bvh->params; - if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) - params.bvh_layout = BVH_LAYOUT_OPTIX; - else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) - params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : - BVH_LAYOUT_EMBREE; - - /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree - * (since they are put into the top level directly, see bvh_embree.cpp) */ - if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && - !bvh->geometry[0]->is_instanced()) { - i++; - continue; - } - - bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); - } - - sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); - i++; - } - - /* Change geometry BVH pointers back to the multi BVH. */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]; - } - } - - virtual void *osl_memory() override - { - if (devices.size() > 1) { - return NULL; - } - return devices.front().device->osl_memory(); - } - - bool is_resident(device_ptr key, Device *sub_device) override - { - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - return find_matching_mem_device(key, sub)->device == sub_device; - } - } - return false; - } - - SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) - { - assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); - - /* Get the memory owner of this key (first try current device, then peer devices) */ - SubDevice *owner_sub = ⊂ - if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { - foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { - if (island_sub != owner_sub && - island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { - owner_sub = island_sub; - } - } - } - return owner_sub; - } - - SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) - { - assert(!island.empty()); - - /* Get the memory owner of this key or the device with the lowest memory usage when new */ - SubDevice *owner_sub = island.front(); - foreach (SubDevice *island_sub, island) { - if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : - (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { - owner_sub = island_sub; - } - } - return owner_sub; - } - - inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) - { - return find_matching_mem_device(key, sub)->ptr_map[key]; - } - - void mem_alloc(device_memory &mem) override - { - device_ptr key = unique_key++; - - if (mem.type == MEM_PIXELS) { - /* Always allocate pixels memory on all devices - * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */ - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || - mem.type == MEM_DEVICE_ONLY); - /* The remaining memory types can be distributed across devices */ - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(key, island); - mem.device = owner_sub->device; - mem.device_pointer = 0; - mem.device_size = 0; - - owner_sub->device->mem_alloc(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* The tile buffers are allocated on each device (see below), so copy to all of them */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_copy_to(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - - if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { - /* Need to create texture objects and update pointer in kernel globals on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_copy_to(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - - SubDevice *owner_sub = find_matching_mem_device(key, sub); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - - owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? 
existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* This is a hack to only allocate the tile buffers on denoising devices - * Similarly the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfere with each other */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - vector<device_ptr> device_pointers; - device_pointers.reserve(devices.size()); - - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - - device_pointers.push_back(mem.device_pointer); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map[key] = device_pointers.front(); - device_pointers.erase(device_pointers.begin()); - } - else { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_zero(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory &mem) override - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - /* Free memory that was allocated for all devices (see above) on each device */ - if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map.erase(key); - } - else { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - mem.device_size = existing_size; - - owner_sub->device->mem_free(mem); - owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); - - if (mem.type == MEM_TEXTURE) { - /* Free texture objects on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_free(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - foreach (SubDevice &sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void 
draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override - { - assert(rgba.type == MEM_PIXELS); - - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h / devices.size(); - int sub_height = height / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height; - int sdy = dy + i * sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels( - rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile &tile) override - { - if (!tile.buffer) { - return; - } - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - tile.buffer = find_matching_mem(tile.buffer, sub); - return; - } - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) { - tile.buffer = sub.ptr_map[tile.buffer]; - return; - } - } - } - - int device_number(Device *sub_device) override - { - int i = 0; - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) - return i; - i++; - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - tile.buffer = mem.device_pointer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - /* Skip unnecessary copies in viewport mode (buffer covers the - * whole 
image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tile); - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - if (mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being de-noised by the CPU thread. */ - if (!tile.buffers->map_neighbor_copied) { - tile.buffers->map_neighbor_copied = true; - mem.copy_from_device(); - } - - if (mem.device == this) { - /* Can re-use memory if tile is already allocated on the sub device. */ - map_tile(sub_device, tile); - mem.swap_device(sub_device, mem.device_size, tile.buffer); - } - else { - mem.swap_device(sub_device, 0, 0); - } - - mem.copy_to_device(); - - tile.buffer = mem.device_pointer; - tile.device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - RenderTile &target_tile = neighbors.target; - device_vector<float> &mem = target_tile.buffers->buffer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - return; - } - - /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); - mem.copy_from_device(); - mem.restore_device(); - - /* Copy denoised result to the original device. 
*/ - mem.copy_to_device(); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - - if (mem.device != sub_device && mem.device != this) { - /* Free up memory again if it was allocated for the copy above. */ - mem.swap_device(sub_device, tile.device_size, tile.buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask &task) override - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach (SubDevice &sub, devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask &task) override - { - list<SubDevice> task_devices = devices; - if (!denoising_devices.empty()) { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Denoising tasks should be redirected to the denoising devices entirely. */ - task_devices = denoising_devices; - } - else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) { - const uint tile_types = task.tile_types; - /* For normal rendering tasks only redirect the denoising part to the denoising devices. - * Do not need to split the task here, since they all run through 'acquire_tile'. */ - task.tile_types = RenderTile::DENOISE; - foreach (SubDevice &sub, denoising_devices) { - sub.device->task_add(task); - } - /* Rendering itself should still be executed on the rendering devices. 
*/ - task.tile_types = tile_types ^ RenderTile::DENOISE; - } - } - - list<DeviceTask> tasks; - task.split(tasks, task_devices.size()); - - foreach (SubDevice &sub, task_devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if (task.buffer) - subtask.buffer = find_matching_mem(task.buffer, sub); - if (task.rgba_byte) - subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if (task.rgba_half) - subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if (task.shader_input) - subtask.shader_input = find_matching_mem(task.shader_input, sub); - if (task.shader_output) - subtask.shader_output = find_matching_mem(task.shader_output, sub); - - sub.device->task_add(subtask); - - if (task.buffers && task.buffers->buffer.device == this) { - /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */ - sub.device->task_wait(); - } - } - } - } - - void task_wait() override - { - foreach (SubDevice &sub, devices) - sub.device->task_wait(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_wait(); - } - - void task_cancel() override - { - foreach (SubDevice &sub, devices) - sub.device->task_cancel(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_cancel(); - } -}; - -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new MultiDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp deleted file mode 100644 index 8904b517e92..00000000000 --- a/intern/cycles/device/device_network.cpp +++ /dev/null @@ -1,812 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_network.h" -#include "device/device.h" -#include "device/device_intern.h" - -#include "util/util_foreach.h" -#include "util/util_logging.h" - -#if defined(WITH_NETWORK) - -CCL_NAMESPACE_BEGIN - -typedef map<device_ptr, device_ptr> PtrMap; -typedef vector<uint8_t> DataVector; -typedef map<device_ptr, DataVector> DataMap; - -/* tile list */ -typedef vector<RenderTile> TileList; - -/* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) -{ - for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); -} - -class NetworkDevice : public Device { - public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; - - boost::system::error_code error = boost::asio::error::host_not_found; - 
while (error && endpoint_iterator != end) { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if (error) - error_func.network_error(error.message()); - - mem_counter = 0; - } - - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory &mem) - { - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); - - mem.device_pointer = ++mem_counter; - - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } - - void mem_copy_to(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_copy_to"); - - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); - - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } - - void mem_zero(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_zero"); - - snd.add(mem); - snd.write(); - } - - void mem_free(device_memory &mem) - { - if (mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } - - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "const_copy_to"); - - string name_string(name); - - 
snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) - { - if (error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask &task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for (;;) { - if (error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive calls! 
*/ - if (the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if (rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if (it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask &) - { - return 1; - } - - private: - NetworkError error_func; -}; - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address) -{ - return new NetworkDevice(info, stats, profiler, address); -} - -void device_network_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; - - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.has_osl = false; - info.denoisers = DENOISER_NONE; - - devices.push_back(info); -} - -class DeviceServer { - public: - thread_mutex rpc_lock; - - void network_error(const string &message) - { - error_func.network_error(message); - } - - bool have_error() - { - return error_func.have_error(); - } - - DeviceServer(Device *device_, tcp::socket &socket_) - : device(device_), socket(socket_), stop(false), 
blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for (;;) { - listen_step(); - - if (stop) - break; - } - } - - protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator, bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator, bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = 
ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. - * The lock must be unlocked before returning */ - void process(RPCReceive &rcv, thread_scoped_lock &lock) - { - if (rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if (rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. 
*/ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr) & (data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t *)mem.host_pointer, data_size); - lock.unlock(); - } - else if (rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void *) ? (device_ptr) & (data_v[0]) : 0; - } - - /* Zero memory. 
*/ - device->mem_zero(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if (rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if (rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if (task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if (task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if (task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if (task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if (task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = 
function_bind(&DeviceServer::task_update_progress_sample, - this); - task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if (rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if (rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "acquire_tile") { - tile = entry.tile; - - if (tile.buffer) - tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if (entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void 
task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile &) - { - ; /* skip */ - } - - void task_release_tile(RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if (tile.buffer) - tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket &socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; - - private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ -}; - -void Device::server_run() -{ - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for (;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch (exception &e) { - 
fprintf(stderr, "Network server exception: %s\n", e.what()); - } -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h deleted file mode 100644 index b3a0f6daa57..00000000000 --- a/intern/cycles/device/device_network.h +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_NETWORK_H__ -#define __DEVICE_NETWORK_H__ - -#ifdef WITH_NETWORK - -# include <boost/archive/binary_iarchive.hpp> -# include <boost/archive/binary_oarchive.hpp> -# include <boost/archive/text_iarchive.hpp> -# include <boost/archive/text_oarchive.hpp> -# include <boost/array.hpp> -# include <boost/asio.hpp> -# include <boost/bind.hpp> -# include <boost/serialization/vector.hpp> -# include <boost/thread.hpp> - -# include <deque> -# include <iostream> -# include <sstream> - -# include "render/buffers.h" - -# include "util/util_foreach.h" -# include "util/util_list.h" -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -using std::cerr; -using std::cout; -using std::exception; -using std::hex; -using std::setw; - -using boost::asio::ip::tcp; - -static const int SERVER_PORT = 5120; -static const int DISCOVER_PORT = 5121; -static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; -static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; - -# 
if 0 -typedef boost::archive::text_oarchive o_archive; -typedef boost::archive::text_iarchive i_archive; -# else -typedef boost::archive::binary_oarchive o_archive; -typedef boost::archive::binary_iarchive i_archive; -# endif - -/* Serialization of device memory */ - -class network_device_memory : public device_memory { - public: - network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) - { - } - - ~network_device_memory() - { - device_pointer = 0; - }; - - vector<char> local_data; -}; - -/* Common network error function / object for both DeviceNetwork and DeviceServer. */ -class NetworkError { - public: - NetworkError() - { - error = ""; - error_count = 0; - } - - ~NetworkError() - { - } - - void network_error(const string &message) - { - error = message; - error_count += 1; - } - - bool have_error() - { - return true ? error_count > 0 : false; - } - - private: - string error; - int error_count; -}; - -/* Remote procedure call Send */ - -class RPCSend { - public: - RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive &name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory &mem) - { - archive &mem.data_type &mem.data_elements &mem.data_size; - archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - archive &mem.type &string(mem.name); - archive &mem.interpolation &mem.extension; - archive &mem.device_pointer; - } - - template<typename T> void add(const T &data) - { - archive &data; - } - - void add(const DeviceTask &task) - { - int type = (int)task.type; - archive &type &task.x &task.y &task.w &task.h; - archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - archive &task.offset &task.stride; - archive &task.shader_input &task.shader_output &task.shader_eval_type; - archive &task.shader_x &task.shader_w; 
- archive &task.need_finish_queue; - } - - void add(const RenderTile &tile) - { - archive &tile.x &tile.y &tile.w &tile.h; - archive &tile.start_sample &tile.num_samples &tile.sample; - archive &tile.resolution &tile.offset &tile.stride; - archive &tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write( - socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - boost::asio::write( - socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write( - socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - } - - protected: - string name; - tcp::socket &socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; -}; - -/* Remote procedure call Receive */ - -class RPCReceive { - public: - RPCReceive(tcp::socket &socket_, NetworkError *e) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if (len == header.size()) { - /* decode header */ - string header_str(&header[0], 
header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if ((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if (error.value()) - error_func->network_error(error.message()); - - if (len == data_size) { - archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive &name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory &mem, string &name) - { - *archive &mem.data_type &mem.data_elements &mem.data_size; - *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - *archive &mem.type &name; - *archive &mem.interpolation &mem.extension; - *archive &mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if (mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T &data) - { - *archive &data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - if (len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask &task) - { - int type; - - *archive &type &task.x &task.y &task.w &task.h; - *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - *archive &task.offset &task.stride; - *archive &task.shader_input &task.shader_output &task.shader_eval_type; - *archive &task.shader_x &task.shader_w; - *archive &task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile &tile) - { - *archive &tile.x &tile.y &tile.w &tile.h; - *archive &tile.start_sample &tile.num_samples &tile.sample; - *archive &tile.resolution &tile.offset &tile.stride; - *archive &tile.buffer; - - tile.buffers = NULL; - } - - string name; - - protected: - tcp::socket &socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; -}; - -/* Server auto discovery */ - -class ServerDiscovery { - public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if (discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - - private: - void handle_receive_from(const boost::system::error_code &error, size_t size) - { - if (error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if (size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if (collect_servers) { - if (msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); - - if (!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if (msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), - receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, - this, - boost::asio::placeholders::error, - boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string &msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; -}; - -CCL_NAMESPACE_END - -#endif - -#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp deleted file mode 100644 index 9abb7cfb7fe..00000000000 --- a/intern/cycles/device/device_opencl.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" -# include "device/device.h" -# include "device/device_intern.h" - -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_set.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return opencl_create_split_device(info, stats, profiler, background); -} - -bool device_opencl_init() -{ - static bool initialized = false; - static bool result = false; - - if (initialized) - return result; - - initialized = true; - - if (OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if (clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : - "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; -} - -static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) -{ -# ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except (EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -# else - return clGetPlatformIDs(0, NULL, num_platforms); -# endif -} - -void device_opencl_info(vector<DeviceInfo> &devices) -{ - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if (num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. 
*/ - int num_devices = 0; - set<string> unique_ids; - foreach (OpenCLPlatformDevice &platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string &platform_name = platform_device.platform_name; - const string &device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if (hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if (unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. */ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - /* Disabled for now due to apparent AMD driver bug. */ - info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing"; - - devices.push_back(info); - num_devices++; - } -} - -string device_opencl_capabilities() -{ - if (OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. 
- */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if (num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - -# define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while (false) -# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \ - do { \ - string value; \ - size_t length = 0; \ - if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \ - vector<char> buffer(length + 1); \ - if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \ - value = string(buffer.data()); \ - } \ - } \ - if (is_optional && !(length != 0 && value[0] != '\0')) { \ - break; \ - } \ - result += string_printf("%s: %s\n", name, value.c_str()); \ - } while (false) -# define APPEND_PLATFORM_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false) -# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true) -# define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -# define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -# define APPEND_DEVICE_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false) -# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true) - - vector<cl_device_id> 
device_ids; - for (cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME); - APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR); - APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION); - APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE); - APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS); - - cl_uint num_devices = 0; - opencl_assert( - clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs( - platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); - for (cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); - APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); - APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS); - APPEND_DEVICE_INFO( - device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -# undef APPEND_INFO -# undef APPEND_STRING_INFO_IMPL -# undef 
APPEND_PLATFORM_STRING_INFO -# undef APPEND_STRING_EXTENSION_INFO -# undef APPEND_PLATFORM_INFO -# undef APPEND_DEVICE_INFO -# undef APPEND_DEVICE_STRING_INFO -# undef APPEND_DEVICE_STRING_EXTENSION_INFO - - return result; -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp deleted file mode 100644 index 6f9a7943722..00000000000 --- a/intern/cycles/device/device_optix.cpp +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * Copyright 2019, NVIDIA Corporation. - * Copyright 2019, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPTIX - -# include "bvh/bvh.h" -# include "bvh/bvh_optix.h" -# include "device/cuda/device_cuda.h" -# include "device/device_denoising.h" -# include "device/device_intern.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/scene.h" -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" - -# ifdef WITH_CUDA_DYNLOAD -# include <cuew.h> -// Do not use CUDA SDK headers when using CUEW -# define OPTIX_DONT_INCLUDE_CUDA -# endif -# include <optix_function_table_definition.h> -# include <optix_stubs.h> - -// TODO(pmours): Disable this once drivers have native support -# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 - -CCL_NAMESPACE_BEGIN - -/* Make sure this stays in sync with kernel_globals.h */ -struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -}; -struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX -}; - -# define check_result_cuda(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_cuda_ret(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define check_result_optix(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s 
(device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_optix_ret(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define launch_filter_kernel(func_name, w, h, args) \ - { \ - CUfunction func; \ - check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \ - check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret( \ - cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ - } \ - (void)0 - -class OptiXDevice : public CUDADevice { - - // List of OptiX program groups - enum { - PG_RGEN, - PG_MISS, - PG_HITD, // Default hit group - PG_HITS, // __SHADOW_RECORD_ALL__ hit group - PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) -# if OPTIX_ABI_VERSION >= 36 - PG_HITD_MOTION, - PG_HITS_MOTION, -# endif - PG_BAKE, // kernel_bake_evaluate - PG_DISP, // kernel_displace_evaluate - PG_BACK, // kernel_background_evaluate - PG_CALL, - NUM_PROGRAM_GROUPS = PG_CALL + 3 - }; - - // List of OptiX pipelines - enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; - - // A single shader binding table entry - struct SbtRecord { - char header[OPTIX_SBT_RECORD_HEADER_SIZE]; - }; - - // Information stored about CUDA memory allocations - struct CUDAMem { - bool free_map_host = false; - CUarray array = NULL; - CUtexObject texobject = 0; - bool use_mapped_host = false; - }; - - // Helper class to manage current CUDA context - struct CUDAContextScope { - 
CUDAContextScope(CUcontext ctx) - { - cuCtxPushCurrent(ctx); - } - ~CUDAContextScope() - { - cuCtxPopCurrent(NULL); - } - }; - - // Use a pool with multiple threads to support launches with multiple CUDA streams - TaskPool task_pool; - - vector<CUstream> cuda_stream; - OptixDeviceContext context = NULL; - - OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module - OptixModule builtin_modules[2] = {}; - OptixPipeline pipelines[NUM_PIPELINES] = {}; - - bool motion_blur = false; - device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; - OptixTraversableHandle tlas_handle = 0; - - OptixDenoiser denoiser = NULL; - device_only_memory<unsigned char> denoiser_state; - int denoiser_input_passes = 0; - - vector<device_only_memory<char>> delayed_free_bvh_memory; - thread_mutex delayed_free_bvh_mutex; - - public: - OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : CUDADevice(info_, stats_, profiler_, background_), - sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), - denoiser_state(this, "__denoiser_state", true) - { - // Store number of CUDA streams in device info - info.cpu_threads = DebugFlags().optix.cuda_streams; - - // Make the CUDA context current - if (!cuContext) { - return; // Do not initialize if CUDA context creation failed already - } - const CUDAContextScope scope(cuContext); - - // Create OptiX context for this device - OptixDeviceContextOptions options = {}; -# ifdef WITH_CYCLES_LOGGING - options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 - options.logCallbackFunction = - [](unsigned int level, const char *, const char *message, void *) { - switch (level) { - case 1: - LOG_IF(FATAL, VLOG_IS_ON(1)) << message; - break; - case 2: - LOG_IF(ERROR, VLOG_IS_ON(1)) << message; - break; - case 3: - LOG_IF(WARNING, VLOG_IS_ON(1)) << message; - break; - case 4: - LOG_IF(INFO, VLOG_IS_ON(1)) << message; - break; - } - }; 
-# endif - check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); -# ifdef WITH_CYCLES_LOGGING - check_result_optix(optixDeviceContextSetLogCallback( - context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); -# endif - - // Create launch streams - cuda_stream.resize(info.cpu_threads); - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); - - // Fix weird compiler bug that assigns wrong size - launch_params.data_elements = sizeof(KernelParams); - // Allocate launch parameter buffer memory on device - launch_params.alloc_to_device(info.cpu_threads); - } - ~OptiXDevice() - { - // Stop processing any more tasks - task_pool.cancel(); - - // Make CUDA context current - const CUDAContextScope scope(cuContext); - - free_bvh_memory_delayed(); - - sbt_data.free(); - texture_info.free(); - launch_params.free(); - denoiser_state.free(); - - // Unload modules - if (optix_module != NULL) - optixModuleDestroy(optix_module); - for (unsigned int i = 0; i < 2; ++i) - if (builtin_modules[i] != NULL) - optixModuleDestroy(builtin_modules[i]); - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) - if (pipelines[i] != NULL) - optixPipelineDestroy(pipelines[i]); - - // Destroy launch streams - for (CUstream stream : cuda_stream) - cuStreamDestroy(stream); - - if (denoiser != NULL) - optixDenoiserDestroy(denoiser); - - optixDeviceContextDestroy(context); - } - - private: - bool show_samples() const override - { - // Only show samples if not rendering multiple tiles in parallel - return info.cpu_threads == 1; - } - - BVHLayoutMask get_bvh_layout_mask() const override - { - // CUDA kernels are used when doing baking, so need to build a BVH those can understand too! 
- if (optix_module == NULL) - return CUDADevice::get_bvh_layout_mask(); - - // OptiX has its own internal acceleration structure format - return BVH_LAYOUT_OPTIX; - } - - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter, - bool /*split*/) override - { - // Split kernel is not supported in OptiX - string common_cflags = CUDADevice::compile_kernel_get_common_cflags( - requested_features, filter, false); - - // Add OptiX SDK include directory to include paths - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } - - // Specialization for shader raytracing - if (requested_features.use_shader_raytrace) { - common_cflags += " --keep-device-functions"; - } - else { - common_cflags += " -D __NO_SHADER_RAYTRACE__"; - } - - return common_cflags; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - if (have_error()) { - // Abort early if context creation failed already - return false; - } - - // Load CUDA modules because we need some of the utility kernels - if (!CUDADevice::load_kernels(requested_features)) { - return false; - } - - // Baking is currently performed using CUDA, so no need to load OptiX kernels - if (requested_features.use_baking) { - return true; - } - - const CUDAContextScope scope(cuContext); - - // Unload existing OptiX module and pipelines first - if (optix_module != NULL) { - optixModuleDestroy(optix_module); - optix_module = NULL; - } - for (unsigned int i = 0; i < 2; ++i) { - if (builtin_modules[i] != NULL) { - optixModuleDestroy(builtin_modules[i]); - builtin_modules[i] = NULL; - } - } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { - if (pipelines[i] != NULL) { - optixPipelineDestroy(pipelines[i]); - pipelines[i] = NULL; - } - } - - OptixModuleCompileOptions module_options = {}; - module_options.maxRegisterCount = 0; // Do not set an explicit register 
limit - module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; - -# if OPTIX_ABI_VERSION >= 41 - module_options.boundValues = nullptr; - module_options.numBoundValues = 0; -# endif - - OptixPipelineCompileOptions pipeline_options = {}; - // Default to no motion blur and two-level graph, since it is the fastest option - pipeline_options.usesMotionBlur = false; - pipeline_options.traversableGraphFlags = - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; - pipeline_options.numAttributeValues = 2; // u, v - pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h - -# if OPTIX_ABI_VERSION >= 36 - pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; - if (requested_features.use_hair) { - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; - } - else { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; - } - } -# endif - - // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds - // This is necessary since objects may be reported to have motion if the Vector pass is - // active, but may still need to be rendered without motion blur if that isn't active as well - motion_blur = requested_features.use_object_motion; - - if (motion_blur) { - pipeline_options.usesMotionBlur = true; - // Motion blur can insert motion transforms into the traversal graph - // It is no longer a two-level graph then, so need to set flags to allow any configuration - pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; - } - - { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = 
path_get(requested_features.use_shader_raytrace ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); - if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { - set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); - return false; - } - ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true); - } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { - set_error("Failed to load OptiX kernel from '" + ptx_filename + "'"); - return false; - } - - check_result_optix_ret(optixModuleCreateFromPTX(context, - &module_options, - &pipeline_options, - ptx_data.data(), - ptx_data.size(), - nullptr, - 0, - &optix_module)); - } - - // Create program groups - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupOptions group_options = {}; // There are no options currently - group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_RGEN].raygen.module = optix_module; - // Ignore branched integrator for now (see "requested_features.use_integrator_branched") - group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; - group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; - group_descs[PG_MISS].miss.module = optix_module; - group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; - group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITD].hitgroup.moduleCH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; - group_descs[PG_HITD].hitgroup.moduleAH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; - group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - 
group_descs[PG_HITS].hitgroup.moduleAH = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; - - if (requested_features.use_hair) { - group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - - // Add curve intersection programs - if (requested_features.use_hair_thick) { - // Slower programs for thick hair since that also slows down ribbons. - // Ideally this should not be needed. - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - } - else { - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - } - -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - OptixBuiltinISOptions builtin_options = {}; - builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - builtin_options.usesMotionBlur = false; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); - - group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; - group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - - if (motion_blur) { - builtin_options.usesMotionBlur = true; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); - - group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; - group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; - group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; - group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; - } - } -# endif - } 
- - if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { - // Add hit group for local intersections - group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITL].hitgroup.moduleAH = optix_module; - group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; - } - - if (requested_features.use_baking) { - group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BAKE].raygen.module = optix_module; - group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; - } - - if (requested_features.use_true_displacement) { - group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_DISP].raygen.module = optix_module; - group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; - } - - if (requested_features.use_background_light) { - group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BACK].raygen.module = optix_module; - group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; - } - - // Shader raytracing replaces some functions with direct callables - if (requested_features.use_shader_raytrace) { - group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 0].callables.moduleDC = optix_module; - group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; - group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 1].callables.moduleDC = optix_module; - group_descs[PG_CALL + 1].callables.entryFunctionNameDC = - "__direct_callable__kernel_volume_shadow"; - group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 2].callables.moduleDC = optix_module; - group_descs[PG_CALL + 2].callables.entryFunctionNameDC = - "__direct_callable__subsurface_scatter_multi_setup"; - } - - check_result_optix_ret(optixProgramGroupCreate( - context, 
group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); - - // Get program stack sizes - OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; - // Set up SBT, which in this case is used only to select between different programs - sbt_data.alloc(NUM_PROGRAM_GROUPS); - memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); - check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); - } - sbt_data.copy_to_device(); // Upload SBT to device - - // Calculate maximum trace continuation stack size - unsigned int trace_css = stack_size[PG_HITD].cssCH; - // This is based on the maximum of closest-hit and any-hit/intersection programs - trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); -# if OPTIX_ABI_VERSION >= 36 - trace_css = std::max(trace_css, - stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); - trace_css = std::max(trace_css, - stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); -# endif - - OptixPipelineLinkOptions link_options = {}; - link_options.maxTraceDepth = 1; - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; -# if OPTIX_ABI_VERSION < 24 - link_options.overrideUsesMotionBlur = motion_blur; -# endif - - { // Create path tracing pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - 
pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); - - // Combine ray generation and trace continuation stack size - const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; - // Max direct callable depth is one of the following, so combine accordingly - // - __raygen__ -> svm_eval_nodes - // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes - // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - // Set stack size depending on pipeline options - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Only need to create shader evaluation pipeline if one of these features is used: - const bool use_shader_eval_pipeline = requested_features.use_baking || - requested_features.use_background_light || - requested_features.use_true_displacement; - - if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_BAKE]); - pipeline_groups.push_back(groups[PG_DISP]); - pipeline_groups.push_back(groups[PG_BACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); - - // Calculate continuation stack size based on the maximum of all ray generation stack sizes - const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, - std::max(stack_size[PG_DISP].cssRG, - stack_size[PG_BACK].cssRG)) + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Clean up program group objects - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } - - return true; - } - - void thread_run(DeviceTask &task, int thread_index) // Main task entry point - { - if (have_error()) - return; // Abort early if there was an error previously - - if (task.type == DeviceTask::RENDER) { - if (thread_index != 0) { - // Only execute denoising in a single thread (see also 'task_add') - task.tile_types &= ~RenderTile::DENOISE; - } - - RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) - launch_render(task, tile, thread_index); - else if (tile.task == RenderTile::BAKE) { - // Perform baking using CUDA, since it is not currently implemented in OptiX - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - CUDADevice::render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) - launch_denoise(task, tile); - task.release_tile(tile); - if (task.get_cancel() && !task.need_finish_queue) - break; // User requested cancellation - else if (have_error()) - break; // Abort rendering when encountering an error - } - } - else if (task.type == DeviceTask::SHADER) { - // CUDA kernels are used when doing baking - if (optix_module == NULL) - CUDADevice::shader(task); - else - launch_shader_eval(task, thread_index); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - // Set up a single tile that covers the whole task and denoise it - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - launch_denoise(task, tile); - } - } - - void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) - { - assert(thread_index < launch_params.data_size); - 
- // Keep track of total render time of this tile - const scoped_timer timer(&rtile.buffers->render_time); - - WorkTile wtile; - wtile.x = rtile.x; - wtile.y = rtile.y; - wtile.w = rtile.w; - wtile.h = rtile.h; - wtile.offset = rtile.offset; - wtile.stride = rtile.stride; - wtile.buffer = (float *)rtile.buffer; - - const int end_sample = rtile.start_sample + rtile.num_samples; - // Keep this number reasonable to avoid running into TDRs - int step_samples = (info.display_device ? 8 : 32); - - // Offset into launch params buffer so that streams use separate data - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - const CUDAContextScope scope(cuContext); - - for (int sample = rtile.start_sample; sample < end_sample;) { - // Copy work tile information to device - wtile.start_sample = sample; - wtile.num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile.num_samples = min(wtile.num_samples, end_sample - sample); - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - check_result_cuda( - cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * 
sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - // Launch the ray generation program - check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - // Launch with samples close to each other for better locality - wtile.w * wtile.num_samples, - wtile.h, - 1)); - - // Run the adaptive sampling kernels at selected samples aligned to step samples. - uint filter_sample = wtile.start_sample + wtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - } - - // Wait for launch to finish - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - // Update current sample, so it is displayed correctly - sample += wtile.num_samples; - rtile.sample = sample; - // Update task progress after the kernel completed rendering - task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); - - if (task.get_cancel() && !task.need_finish_queue) - return; // Cancel rendering - } - - // Finalize adaptive sampling - if (task.adaptive_sampling.use) { - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); - } - } - - bool launch_denoise(DeviceTask &task, RenderTile &rtile) - { - // Update current sample (for display and NLM denoising task) - rtile.sample = rtile.start_sample + rtile.num_samples; - - // Make CUDA context current now, since it is used for both denoising tasks - const CUDAContextScope scope(cuContext); - - // Choose between OptiX and NLM denoising - if (task.denoising.type == DENOISER_OPTIX) { - // Map 
neighboring tiles onto this device, indices are as following: - // Where index 4 is the center tile and index 9 is the target for the result. - // 0 1 2 - // 3 4 5 - // 6 7 8 9 - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - RenderTile &target_tile = neighbors.target; - rtile = center_tile; // Tile may have been modified by mapping code - - // Calculate size of the tile to denoise (including overlap) - int4 rect = center_tile.bounds(); - // Overlap between tiles has to be at least 64 pixels - // TODO(pmours): Query this value from OptiX - rect = rect_expand(rect, 64); - int4 clip_rect = neighbors.bounds(); - rect = rect_clip(rect, clip_rect); - int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); - - // Calculate byte offsets and strides - int pixel_stride = task.pass_stride * (int)sizeof(float); - int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; - const int pass_offset[3] = { - (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; - - // Start with the current tile pointer offset - int input_stride = pixel_stride; - device_ptr input_ptr = rtile.buffer + pixel_offset; - - // Copy tile data into a common buffer if necessary - device_only_memory<float> input(this, "denoiser input", true); - device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); - - bool contiguous_memory = true; - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { - contiguous_memory = false; - } - } - - if (contiguous_memory) { - // Tiles are in continous memory, so can just subtract overlap offset - 
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; - // Stride covers the whole width of the image and not just a single tile - input_stride *= rtile.stride; - } - else { - // Adjacent tiles are in separate memory regions, so need to copy them into a single one - input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); - // Start with the new input buffer - input_ptr = input.device_pointer; - // Stride covers the width of the new input buffer, which includes tile width and overlap - input_stride *= rect_size.x; - - TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - tile_info->offsets[i] = neighbors.tiles[i].offset; - tile_info->strides[i] = neighbors.tiles[i].stride; - tile_info->buffers[i] = neighbors.tiles[i].buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - tile_info_mem.copy_to_device(); - - void *args[] = { - &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; - launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory<float> input_rgb(this, "denoiser input rgb", true); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); - - void *input_args[] = {&input_rgb.device_pointer, - &input_ptr, - &rect_size.x, - &rect_size.y, - &input_stride, - &task.pass_stride, - const_cast<int *>(pass_offset), - &task.denoising.input_passes, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); - - input_ptr = 
input_rgb.device_pointer; - pixel_stride = 3 * sizeof(float); - input_stride = rect_size.x * pixel_stride; -# endif - - const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.input_passes != denoiser_input_passes); - if (recreate_denoiser) { - // Destroy existing handle before creating new one - if (denoiser != NULL) { - optixDenoiserDestroy(denoiser); - } - - // Create OptiX denoiser handle on demand when it is first used - OptixDenoiserOptions denoiser_options = {}; - assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); -# if OPTIX_ABI_VERSION >= 47 - denoiser_options.guideAlbedo = task.denoising.input_passes >= 2; - denoiser_options.guideNormal = task.denoising.input_passes >= 3; - check_result_optix_ret(optixDenoiserCreate( - context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser)); -# else - denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); -# if OPTIX_ABI_VERSION < 28 - denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; -# endif - check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); - check_result_optix_ret( - optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); -# endif - - // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.input_passes; - } - - OptixDenoiserSizes sizes = {}; - check_result_optix_ret( - optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - -# if OPTIX_ABI_VERSION < 28 - const size_t scratch_size = sizes.recommendedScratchSizeInBytes; -# else - const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; -# endif - const size_t scratch_offset = sizes.stateSizeInBytes; - - // Allocate denoiser state if tile size has changed since last setup - if (recreate_denoiser || (denoiser_state.data_width != rect_size.x || - denoiser_state.data_height != 
rect_size.y)) { - denoiser_state.alloc_to_device(scratch_offset + scratch_size); - - // Initialize denoiser state for the current tile size - check_result_optix_ret(optixDenoiserSetup(denoiser, - 0, - rect_size.x, - rect_size.y, - denoiser_state.device_pointer, - scratch_offset, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); - - denoiser_state.data_width = rect_size.x; - denoiser_state.data_height = rect_size.y; - } - - // Set up input and output layer information - OptixImage2D input_layers[3] = {}; - OptixImage2D output_layers[1] = {}; - - for (int i = 0; i < 3; ++i) { -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); -# else - input_layers[i].data = input_ptr + pass_offset[i]; -# endif - input_layers[i].width = rect_size.x; - input_layers[i].height = rect_size.y; - input_layers[i].rowStrideInBytes = input_stride; - input_layers[i].pixelStrideInBytes = pixel_stride; - input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - output_layers[0].data = input_ptr; - output_layers[0].width = rect_size.x; - output_layers[0].height = rect_size.y; - output_layers[0].rowStrideInBytes = input_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; - int2 output_offset = overlap_offset; - overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually -# else - output_layers[0].data = target_tile.buffer + pixel_offset; - output_layers[0].width = target_tile.w; - output_layers[0].height = target_tile.h; - output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; -# endif - output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; - -# if OPTIX_ABI_VERSION >= 47 - OptixDenoiserLayer image_layers = {}; - image_layers.input = input_layers[0]; - image_layers.output = output_layers[0]; - - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = 
input_layers[1]; - guide_layers.normal = input_layers[2]; -# endif - - // Finally run denonising - OptixDenoiserParams params = {}; // All parameters are disabled/zero -# if OPTIX_ABI_VERSION >= 47 - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - &params, - denoiser_state.device_pointer, - scratch_offset, - &guide_layers, - &image_layers, - 1, - overlap_offset.x, - overlap_offset.y, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# else - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - &params, - denoiser_state.device_pointer, - scratch_offset, - input_layers, - task.denoising.input_passes, - overlap_offset.x, - overlap_offset.y, - output_layers, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# endif - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - void *output_args[] = {&input_ptr, - &target_tile.buffer, - &output_offset.x, - &output_offset.y, - &rect_size.x, - &rect_size.y, - &target_tile.x, - &target_tile.y, - &target_tile.w, - &target_tile.h, - &target_tile.offset, - &target_tile.stride, - &task.pass_stride, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); -# endif - - check_result_cuda_ret(cuStreamSynchronize(0)); - - task.unmap_neighbor_tiles(neighbors, this); - } - else { - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - CUDADevice::denoise(rtile, denoising); - } - - // Update task progress after the denoiser completed processing - task.update_progress(&rtile, rtile.w * rtile.h); - - return true; - } - - void launch_shader_eval(DeviceTask &task, int thread_index) - { - unsigned int rgen_index = PG_BACK; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) - rgen_index = PG_BAKE; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) - rgen_index = PG_DISP; - - const CUDAContextScope scope(cuContext); - - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * 
launch_params.data_elements; - - for (int sample = 0; sample < task.num_samples; ++sample) { - ShaderParams params; - params.input = (uint4 *)task.shader_input; - params.output = (float4 *)task.shader_output; - params.type = task.shader_eval_type; - params.filter = task.shader_filter; - params.sx = task.shader_x; - params.offset = task.offset; - params.sample = sample; - - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), - &params, - sizeof(params), - cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - task.shader_w, - 1, - 1)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - } - - bool build_optix_bvh(BVHOptiX *bvh, - OptixBuildOperation operation, - const OptixBuildInput &build_input, - uint16_t num_motion_steps) - { - /* Allocate and build acceleration structures only one at a time, to prevent parallel builds - * from running out of memory (since both original and compacted acceleration structure 
memory - * may be allocated at the same time for the duration of this function). The builds would - * otherwise happen on the same CUDA stream anyway. */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - const CUDAContextScope scope(cuContext); - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - // Compute memory usage - OptixAccelBufferSizes sizes = {}; - OptixAccelBuildOptions options = {}; - options.operation = operation; - if (use_fast_trace_bvh) { - VLOG(2) << "Using fast to trace OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; - } - else { - VLOG(2) << "Using fast to update OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; - } - - options.motionOptions.numKeys = num_motion_steps; - options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; - options.motionOptions.timeBegin = 0.0f; - options.motionOptions.timeEnd = 1.0f; - - check_result_optix_ret( - optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); - - // Allocate required output buffers - device_only_memory<char> temp_mem(this, "optix temp as build mem", true); - temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); - if (!temp_mem.device_pointer) - return false; // Make sure temporary memory allocation succeeded - - // Acceleration structure memory has to be allocated on the device (not allowed to be on host) - device_only_memory<char> &out_data = bvh->as_data; - if (operation == OPTIX_BUILD_OPERATION_BUILD) { - assert(out_data.device == this); - out_data.alloc_to_device(sizes.outputSizeInBytes); - if (!out_data.device_pointer) - return false; - } - else { - assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); - } - - // Finally build the acceleration structure - OptixAccelEmitDesc compacted_size_prop = {}; - compacted_size_prop.type = 
OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; - // A tiny space was allocated for this property at the end of the temporary buffer above - // Make sure this pointer is 8-byte aligned - compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); - - OptixTraversableHandle out_handle = 0; - check_result_optix_ret(optixAccelBuild(context, - NULL, - &options, - &build_input, - 1, - temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, - sizes.outputSizeInBytes, - &out_handle, - use_fast_trace_bvh ? &compacted_size_prop : NULL, - use_fast_trace_bvh ? 1 : 0)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for all operations to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - // Compact acceleration structure to save memory (only if using fast trace as the - // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case). - if (use_fast_trace_bvh) { - uint64_t compacted_size = sizes.outputSizeInBytes; - check_result_cuda_ret( - cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); - - // Temporary memory is no longer needed, so free it now to make space - temp_mem.free(); - - // There is no point compacting if the size does not change - if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as", false); - compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) - // Do not compact if memory allocation for compacted acceleration structure fails - // Can just use the uncompacted one then, so succeed here regardless - return true; - - check_result_optix_ret(optixAccelCompact(context, - NULL, - out_handle, - compacted_data.device_pointer, - compacted_size, - &out_handle)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for compaction to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - std::swap(out_data.device_size, 
compacted_data.device_size); - std::swap(out_data.device_pointer, compacted_data.device_pointer); - // Original acceleration structure memory is freed when 'compacted_data' goes out of scope - } - } - - return true; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { - /* For baking CUDA is used, build appropriate BVH for that. */ - Device::build_bvh(bvh, progress, refit); - return; - } - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - free_bvh_memory_delayed(); - - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - progress.set_substatus("Building OptiX acceleration structure"); - - if (!bvh->params.top_level) { - assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); - - OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; - /* Refit is only possible when using fast to trace BVH (because AS is built with - * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). 
*/ - if (refit && !use_fast_trace_bvh) { - assert(bvh_optix->traversable_handle != 0); - operation = OPTIX_BUILD_OPERATION_UPDATE; - } - else { - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - } - - // Build bottom level acceleration structures (BLAS) - Geometry *const geom = bvh->geometry[0]; - if (geom->geometry_type == Geometry::HAIR) { - // Build BLAS for curve primitives - Hair *const hair = static_cast<Hair *const>(geom); - if (hair->num_curves() == 0) { - return; - } - - const size_t num_segments = hair->num_segments(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { - num_motion_steps = hair->get_motion_steps(); - } - - device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); -# if OPTIX_ABI_VERSION >= 36 - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - // Four control points for each curve segment - const size_t num_vertices = num_segments * 4; - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - index_data.alloc(num_segments); - vertex_data.alloc(num_vertices * num_motion_steps); - } - else -# endif - aabb_data.alloc(num_segments * num_motion_steps); - - // Get AABBs for each motion step - for (size_t step = 0; step < num_motion_steps; ++step) { - // The center step for motion vertices is not stored in the attribute - const float3 *keys = hair->get_curve_keys().data(); - size_t center_step = (num_motion_steps - 1) / 2; - if (step != center_step) { - size_t attr_offset = (step > center_step) ? 
step - 1 : step; - // Technically this is a float4 array, but sizeof(float3) == sizeof(float4) - keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); - } - - for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { - const Hair::Curve curve = hair->get_curve(j); -# if OPTIX_ABI_VERSION >= 36 - const array<float> &curve_radius = hair->get_curve_radius(); -# endif - - for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - int k0 = curve.first_key + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, curve.first_key); - int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); - - const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); - const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); - const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); - const float4 pw = make_float4( - curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - - // Convert Catmull-Rom data to Bezier spline - static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; - static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; - static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; - static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; - v[0] = make_float4( - dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); - v[1] = make_float4( - dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); - v[2] = make_float4( - dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); - v[3] = make_float4( - dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); - } - else -# endif - { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(segment, keys, 
hair->get_curve_radius().data(), bounds); - - const size_t index = step * num_segments + i; - aabb_data[index].minX = bounds.min.x; - aabb_data[index].minY = bounds.min.y; - aabb_data[index].minZ = bounds.min.z; - aabb_data[index].maxX = bounds.max.x; - aabb_data[index].maxY = bounds.max.y; - aabb_data[index].maxZ = bounds.max.z; - } - } - } - } - - // Upload AABB data to GPU - aabb_data.copy_to_device(); -# if OPTIX_ABI_VERSION >= 36 - index_data.copy_to_device(); - vertex_data.copy_to_device(); -# endif - - vector<device_ptr> aabb_ptrs; - aabb_ptrs.reserve(num_motion_steps); -# if OPTIX_ABI_VERSION >= 36 - vector<device_ptr> width_ptrs; - vector<device_ptr> vertex_ptrs; - width_ptrs.reserve(num_motion_steps); - vertex_ptrs.reserve(num_motion_steps); -# endif - for (size_t step = 0; step < num_motion_steps; ++step) { - aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); -# if OPTIX_ABI_VERSION >= 36 - const device_ptr base_ptr = vertex_data.device_pointer + - step * num_vertices * sizeof(float4); - width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size - vertex_ptrs.push_back(base_ptr); -# endif - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; - build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - build_input.curveArray.numPrimitives = num_segments; - build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.curveArray.numVertices = num_vertices; - build_input.curveArray.vertexStrideInBytes = sizeof(float4); - build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); - build_input.curveArray.widthStrideInBytes = sizeof(float4); - 
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; - build_input.curveArray.indexStrideInBytes = sizeof(int); - build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; - } - else -# endif - { - // Disable visibility test any-hit program, since it is already checked during - // intersection. Those trace calls that require anyhit can force it with a ray flag. - build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; - - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; -# if OPTIX_ABI_VERSION < 23 - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; -# else - build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.customPrimitiveArray.numPrimitives = num_segments; - build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); - build_input.customPrimitiveArray.flags = &build_flags; - build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; -# endif - } - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { - // Build BLAS for triangle primitives - Mesh *const mesh = static_cast<Mesh *const>(geom); - if (mesh->num_triangles() == 0) { - return; - } - - const size_t num_verts = mesh->get_verts().size(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && 
motion_keys) { - num_motion_steps = mesh->get_motion_steps(); - } - - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - index_data.alloc(mesh->get_triangles().size()); - memcpy(index_data.data(), - mesh->get_triangles().data(), - mesh->get_triangles().size() * sizeof(int)); - device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - vertex_data.alloc(num_verts * num_motion_steps); - - for (size_t step = 0; step < num_motion_steps; ++step) { - const float3 *verts = mesh->get_verts().data(); - - size_t center_step = (num_motion_steps - 1) / 2; - // The center step for motion vertices is not stored in the attribute - if (step != center_step) { - verts = motion_keys->data_float3() + - (step > center_step ? step - 1 : step) * num_verts; - } - - memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); - } - - // Upload triangle data to GPU - index_data.copy_to_device(); - vertex_data.copy_to_device(); - - vector<device_ptr> vertex_ptrs; - vertex_ptrs.reserve(num_motion_steps); - for (size_t step = 0; step < num_motion_steps; ++step) { - vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; - build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.triangleArray.numVertices = num_verts; - build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; - build_input.triangleArray.vertexStrideInBytes = sizeof(float3); - build_input.triangleArray.indexBuffer = index_data.device_pointer; - build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); - build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; - 
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); - build_input.triangleArray.flags = &build_flags; - // The SBT does not store per primitive data since Cycles already allocates separate - // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in - // one and rely on that having the same meaning in this case. - build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - } - else { - unsigned int num_instances = 0; - unsigned int max_num_instances = 0xFFFFFFFF; - - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); - - optixDeviceContextGetProperty(context, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, - &max_num_instances, - sizeof(max_num_instances)); - // Do not count first bit, which is used to distinguish instanced and non-instanced objects - max_num_instances >>= 1; - if (bvh->objects.size() > max_num_instances) { - progress.set_error( - "Failed to build OptiX acceleration structure because there are too many instances"); - return; - } - - // Fill instance descriptions -# if OPTIX_ABI_VERSION < 41 - device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY); - aabbs.alloc(bvh->objects.size()); -# endif - device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); - instances.alloc(bvh->objects.size()); - - // Calculate total motion transform size and allocate memory for them - size_t motion_transform_offset = 0; - if (motion_blur) { - size_t total_motion_transform_size = 0; - for (Object *const ob : bvh->objects) { - if (ob->is_traceable() && ob->use_motion()) { - total_motion_transform_size = align_up(total_motion_transform_size, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - const size_t motion_keys = 
max(ob->get_motion().size(), 2) - 2; - total_motion_transform_size = total_motion_transform_size + - sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - } - } - - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); - } - - for (Object *ob : bvh->objects) { - // Skip non-traceable objects - if (!ob->is_traceable()) - continue; - - BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); - OptixTraversableHandle handle = blas->traversable_handle; - -# if OPTIX_ABI_VERSION < 41 - OptixAabb &aabb = aabbs[num_instances]; - aabb.minX = ob->bounds.min.x; - aabb.minY = ob->bounds.min.y; - aabb.minZ = ob->bounds.min.z; - aabb.maxX = ob->bounds.max.x; - aabb.maxY = ob->bounds.max.y; - aabb.maxZ = ob->bounds.max.z; -# endif - - OptixInstance &instance = instances[num_instances++]; - memset(&instance, 0, sizeof(instance)); - - // Clear transform to identity matrix - instance.transform[0] = 1.0f; - instance.transform[5] = 1.0f; - instance.transform[10] = 1.0f; - - // Set user instance ID to object index (but leave low bit blank) - instance.instanceId = ob->get_device_index() << 1; - - // Have to have at least one bit in the mask, or else instance would always be culled - instance.visibilityMask = 1; - - if (ob->get_geometry()->has_volume) { - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - instance.visibilityMask |= 2; - } - - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - // Same applies to curves (so they can be skipped in local trace calls) - instance.visibilityMask |= 4; - -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur && ob->get_geometry()->has_motion_blur() && - DebugFlags().optix.curves_api && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - // Select between motion blur and non-motion blur built-in intersection module - instance.sbtOffset = PG_HITD_MOTION - 
PG_HITD; - } -# endif - } - - // Insert motion traversable if object has motion - if (motion_blur && ob->use_motion()) { - size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - - const CUDAContextScope scope(cuContext); - - motion_transform_offset = align_up(motion_transform_offset, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + - motion_transform_offset; - motion_transform_offset += motion_transform_size; - - // Allocate host side memory for motion transform and fill it with transform data - OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - new uint8_t[motion_transform_size]); - motion_transform.child = handle; - motion_transform.motionOptions.numKeys = ob->get_motion().size(); - motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; - motion_transform.motionOptions.timeBegin = 0.0f; - motion_transform.motionOptions.timeEnd = 1.0f; - - OptixSRTData *const srt_data = motion_transform.srtData; - array<DecomposedTransform> decomp(ob->get_motion().size()); - transform_motion_decompose( - decomp.data(), ob->get_motion().data(), ob->get_motion().size()); - - for (size_t i = 0; i < ob->get_motion().size(); ++i) { - // Scale - srt_data[i].sx = decomp[i].y.w; // scale.x.x - srt_data[i].sy = decomp[i].z.w; // scale.y.y - srt_data[i].sz = decomp[i].w.w; // scale.z.z - - // Shear - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z - assert(decomp[i].z.z == 0.0f); // scale.y.x - assert(decomp[i].w.y == 0.0f); // scale.z.x - assert(decomp[i].w.z == 0.0f); // scale.z.y - - // Pivot point - srt_data[i].pvx = 0.0f; - srt_data[i].pvy = 0.0f; - srt_data[i].pvz = 0.0f; - - // Rotation - srt_data[i].qx = decomp[i].x.x; - srt_data[i].qy = decomp[i].x.y; - srt_data[i].qz 
= decomp[i].x.z; - srt_data[i].qw = decomp[i].x.w; - - // Translation - srt_data[i].tx = decomp[i].y.x; - srt_data[i].ty = decomp[i].y.y; - srt_data[i].tz = decomp[i].y.z; - } - - // Upload motion transform to GPU - cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); - delete[] reinterpret_cast<uint8_t *>(&motion_transform); - - // Disable instance transform if object uses motion transform already - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - - // Get traversable handle to motion transform - optixConvertPointerToTraversableHandle(context, - motion_transform_gpu, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, - &instance.traversableHandle); - } - else { - instance.traversableHandle = handle; - - if (ob->get_geometry()->is_instanced()) { - // Set transform matrix - memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); - } - else { - // Disable instance transform if geometry already has it applied to vertex data - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - // Non-instanced objects read ID from 'prim_object', so distinguish - // them from instanced objects with the low bit set - instance.instanceId |= 1; - } - } - } - - // Upload instance descriptions -# if OPTIX_ABI_VERSION < 41 - aabbs.resize(num_instances); - aabbs.copy_to_device(); -# endif - instances.resize(num_instances); - instances.copy_to_device(); - - // Build top-level acceleration structure (TLAS) - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; -# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2 - build_input.instanceArray.aabbs = aabbs.device_pointer; - build_input.instanceArray.numAabbs = num_instances; -# endif - build_input.instanceArray.instances = instances.device_pointer; - build_input.instanceArray.numInstances = num_instances; - - if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { - progress.set_error("Failed to build 
OptiX acceleration structure"); - } - tlas_handle = bvh_optix->traversable_handle; - } - } - - void release_optix_bvh(BVH *bvh) override - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - /* Do delayed free of BVH memory, since geometry holding BVH might be deleted - * while GPU is still rendering. */ - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); - bvh_optix->traversable_handle = 0; - } - - void free_bvh_memory_delayed() - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - delayed_free_bvh_memory.free_memory(); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'film_convert'). - // Could be removed by moving those functions to filter CUDA module. - CUDADevice::const_copy_to(name, host, size); - - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update traversable handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - - update_launch_params(offsetof(KernelParams, data), host, size); - return; - } - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ - return; \ - } -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - } - - void update_launch_params(size_t offset, void *data, size_t data_size) - { - const CUDAContextScope scope(cuContext); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - } - - void task_add(DeviceTask 
&task) override - { - // Upload texture information to device if it has changed since last launch - load_texture_info(); - - if (task.type == DeviceTask::FILM_CONVERT) { - // Execute in main thread because of OpenGL access - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - return; - } - - if (task.type == DeviceTask::DENOISE_BUFFER) { - // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy, 0); - }); - return; - } - - // Split task into smaller ones - list<DeviceTask> tasks; - task.split(tasks, info.cpu_threads); - - // Queue tasks in internal task pool - int task_index = 0; - for (DeviceTask &task : tasks) { - task_pool.push([=] { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - DeviceTask task_copy = task; - thread_run(task_copy, task_index); - }); - task_index++; - } - } - - void task_wait() override - { - // Wait for all queued tasks to finish - task_pool.wait_work(); - } - - void task_cancel() override - { - // Cancel any remaining tasks in the internal pool - task_pool.cancel(); - } -}; - -bool device_optix_init() -{ - if (g_optixFunctionTable.optixDeviceContextCreate != NULL) - return true; // Already initialized function table - - // Need to initialize CUDA as well - if (!device_cuda_init()) - return false; - - const OptixResult result = optixInit(); - - if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; - return false; - } - else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; - return false; - } - - // Loaded OptiX successfully! 
- return true; -} - -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) -{ - devices.reserve(cuda_devices.size()); - - // Simply add all supported CUDA devices as OptiX devices again - for (DeviceInfo info : cuda_devices) { - assert(info.type == DEVICE_CUDA); - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - if (major < 5) { - continue; // Only Maxwell and up are supported by OptiX - } - - info.type = DEVICE_OPTIX; - info.id += "_OptiX"; - info.denoisers |= DENOISER_OPTIX; - info.has_branched_path = false; - - devices.push_back(info); - } -} - -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new OptiXDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp new file mode 100644 index 00000000000..a89ba68d62c --- /dev/null +++ b/intern/cycles/device/device_queue.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_queue.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_time.h" + +#include <iomanip> + +CCL_NAMESPACE_BEGIN + +DeviceQueue::DeviceQueue(Device *device) + : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0) +{ + DCHECK_NE(device, nullptr); +} + +DeviceQueue::~DeviceQueue() +{ + if (VLOG_IS_ON(3)) { + /* Print kernel execution times sorted by time. */ + vector<pair<DeviceKernelMask, double>> stats_sorted; + for (const auto &stat : stats_kernel_time_) { + stats_sorted.push_back(stat); + } + + sort(stats_sorted.begin(), + stats_sorted.end(), + [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) { + return a.second > b.second; + }); + + VLOG(3) << "GPU queue stats:"; + for (const auto &[mask, time] : stats_sorted) { + VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) + << std::right << time << "s: " << device_kernel_mask_as_string(mask); + } + } +} + +void DeviceQueue::debug_init_execution() +{ + if (VLOG_IS_ON(3)) { + last_sync_time_ = time_dt(); + last_kernels_enqueued_ = 0; + } +} + +void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) +{ + if (VLOG_IS_ON(3)) { + VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; + last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); + } +} + +void DeviceQueue::debug_synchronize() +{ + if (VLOG_IS_ON(3)) { + const double new_time = time_dt(); + const double elapsed_time = new_time - last_sync_time_; + VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; + + stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + + last_sync_time_ = new_time; + last_kernels_enqueued_ = 0; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h new file mode 100644 index 00000000000..edda3e61d51 --- /dev/null +++ 
b/intern/cycles/device/device_queue.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_kernel.h" + +#include "device/device_graphics_interop.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class device_memory; + +struct KernelWorkTile; + +/* Abstraction of a command queue for a device. + * Provides API to schedule kernel execution in a specific queue with minimal possible overhead + * from driver side. + * + * This class encapsulates all properties needed for commands execution. */ +class DeviceQueue { + public: + virtual ~DeviceQueue(); + + /* Number of concurrent states to process for integrator, + * based on number of cores and/or available memory. */ + virtual int num_concurrent_states(const size_t state_size) const = 0; + + /* Number of states which keeps the device occupied with work without losing performance. + * The renderer will add more work (when available) when number of active paths falls below this + * value. */ + virtual int num_concurrent_busy_states() const = 0; + + /* Initialize execution of kernels on this queue. + * + * Will, for example, load all data required by the kernels from Device to global or path state. + * + * Use this method after device synchronization has finished before enqueueing any kernels.
 */ + virtual void init_execution() = 0; + + /* Test if an optional device kernel is available. */ + virtual bool kernel_available(DeviceKernel kernel) const = 0; + + /* Enqueue kernel execution. + * + * Execute the kernel work_size times on the device. + * Supported argument types: + * - int: pass pointer to the int + * - device memory: pass pointer to device_memory.device_pointer + * Return false if there was an error executing this or a previous kernel. */ + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0; + + /* Wait until all enqueued kernels have finished execution. + * Return false if there was an error executing any of the enqueued kernels. */ + virtual bool synchronize() = 0; + + /* Copy memory to/from device as part of the command queue, to ensure + * operations are done in order without having to synchronize. */ + virtual void zero_to_device(device_memory &mem) = 0; + virtual void copy_to_device(device_memory &mem) = 0; + virtual void copy_from_device(device_memory &mem) = 0; + + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. */ + + /* Create graphics interoperability context which will be taking care of mapping graphics + * resource as a buffer writable by kernels of this device. */ + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() + { + LOG(FATAL) << "Request of GPU interop of a device which does not support it."; + return nullptr; + } + + /* Device this queue has been created for. */ + Device *device; + + protected: + /* Hide construction so that allocation via `Device` API is enforced. */ + explicit DeviceQueue(Device *device); + + /* Implementations call these from the corresponding methods to generate debugging logs.
 */ + void debug_init_execution(); + void debug_enqueue(DeviceKernel kernel, const int work_size); + void debug_synchronize(); + + /* Combination of kernels enqueued together since last synchronize. */ + DeviceKernelMask last_kernels_enqueued_; + /* Time of synchronize call. */ + double last_sync_time_; + /* Accumulated execution time for combinations of kernels launched together. */ + map<DeviceKernelMask, double> stats_kernel_time_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp deleted file mode 100644 index 9889f688aaa..00000000000 --- a/intern/cycles/device/device_split_kernel.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "device/device_split_kernel.h" - -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" - -#include "util/util_logging.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -static const double alpha = 0.1; /* alpha for rolling average */ - -DeviceSplitKernel::DeviceSplitKernel(Device *device) - : device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) -{ - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; - kernel_adaptive_stopping = NULL; - kernel_adaptive_filter_x = NULL; - kernel_adaptive_filter_y = NULL; - kernel_adaptive_adjust_samples = NULL; -} - -DeviceSplitKernel::~DeviceSplitKernel() -{ - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete 
kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; - delete kernel_adaptive_stopping; - delete kernel_adaptive_filter_x; - delete kernel_adaptive_filter_y; - delete kernel_adaptive_adjust_samples; -} - -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) -{ -#define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, requested_features); \ - if (!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); - LOAD_KERNEL(adaptive_stopping); - LOAD_KERNEL(adaptive_filter_x); - LOAD_KERNEL(adaptive_filter_y); - LOAD_KERNEL(adaptive_adjust_samples); - -#undef LOAD_KERNEL - - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; - - return true; -} - -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size) -{ - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) - << " bytes. 
(" << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; -} - -bool DeviceSplitKernel::path_trace(DeviceTask &task, - RenderTile &tile, - device_memory &kgbuffer, - device_memory &kernel_data) -{ - if (device->have_error()) { - return false; - } - - /* Allocate all required global memory once. */ - if (!kernel_data_initialized) { - kernel_data_initialized = true; - - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; - - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - - /* Make sure that set work size is a multiple of local - * work size dimensions. - */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); - - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); - - /* Calculate max groups */ - - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : - WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - - /* Allocate work_pool_wgs memory. 
*/ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } - - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; - -#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if (device->have_error()) { \ - return false; \ - } \ - if (!kernel_##name->enqueue( \ - KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while (tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - const int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = samples_per_second; - - if (task.adaptive_sampling.use) { - subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample, - subtile.num_samples); - } - - /* Don't go beyond requested number of samples. 
*/ - subtile.num_samples = min(subtile.num_samples, - tile.start_sample + tile.num_samples - tile.sample); - - if (device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while (activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for (int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL( - holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if (task.get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if (time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if (time_dt() > cancel_time) { - return true; - } - } - - int filter_sample = tile.sample + subtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_stopping->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.h, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_x->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_y->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); - - if (avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; - } - -#undef ENQUEUE_SPLIT_KERNEL - - tile.sample += subtile.num_samples; - task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); - - time_multiplier = min(time_multiplier << 1, 10); - - if (task.get_cancel()) { - return true; - } - } - - if (task.adaptive_sampling.use) { - /* Reset the start samples. 
*/ - RenderTile subtile = tile; - subtile.start_sample = tile.start_sample; - subtile.num_samples = tile.sample - tile.start_sample; - enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs); - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_adjust_samples->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - return true; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h deleted file mode 100644 index 07a21b10299..00000000000 --- a/intern/cycles/device/device_split_kernel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_SPLIT_KERNEL_H__ -#define __DEVICE_SPLIT_KERNEL_H__ - -#include "device/device.h" -#include "render/buffers.h" - -CCL_NAMESPACE_BEGIN - -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB - -/* Types used for split kernel */ - -class KernelDimensions { - public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } -}; - -class SplitKernelFunction { - public: - virtual ~SplitKernelFunction() - { - } - - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; -}; - -class DeviceSplitKernel { - private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - SplitKernelFunction *kernel_buffer_update; - SplitKernelFunction *kernel_adaptive_stopping; - SplitKernelFunction *kernel_adaptive_filter_x; - SplitKernelFunction *kernel_adaptive_filter_y; - SplitKernelFunction *kernel_adaptive_adjust_samples; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. 
- */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> - queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - - public: - explicit DeviceSplitKernel(Device *device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask &task, - RenderTile &rtile, - device_memory &kgbuffer, - device_memory &kernel_data); - - virtual uint64_t state_buffer_size(device_memory &kg, - device_memory &data, - size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) = 0; - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask &task) = 0; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp deleted file mode 100644 index 55fbaa31e42..00000000000 --- 
a/intern/cycles/device/device_task.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -#include "device/device_task.h" - -#include "render/buffers.h" - -#include "util/util_algorithm.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -DeviceTask::DeviceTask(Type type_) - : type(type_), - x(0), - y(0), - w(0), - h(0), - rgba_byte(0), - rgba_half(0), - buffer(0), - sample(0), - num_samples(1), - shader_input(0), - shader_output(0), - shader_eval_type(0), - shader_filter(0), - shader_x(0), - shader_w(0), - buffers(nullptr), - tile_types(0), - denoising_from_render(false), - pass_stride(0), - frame_stride(0), - target_pass_stride(0), - pass_denoising_data(0), - pass_denoising_clean(0), - need_finish_queue(false), - integrator_branched(false) -{ - last_update_time = time_dt(); -} - -int DeviceTask::get_subtask_count(int num, int max_size) const -{ - if (max_size != 0) { - int max_size_num; - - if (type == SHADER) { - max_size_num = (shader_w + max_size - 1) / max_size; - } - else { - max_size = max(1, max_size / w); - max_size_num = (h + max_size - 1) / max_size; - } - - num = max(max_size_num, num); - } - - if (type == SHADER) { - num = min(shader_w, num); - } - else if (type == RENDER) { - } - else { - num = min(h, num); - } - - return num; -} - -void DeviceTask::split(list<DeviceTask> &tasks, int num, 
int max_size) const -{ - num = get_subtask_count(num, max_size); - - if (type == SHADER) { - for (int i = 0; i < num; i++) { - int tx = shader_x + (shader_w / num) * i; - int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if (type == RENDER) { - for (int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for (int i = 0; i < num; i++) { - int ty = y + (h / num) * i; - int th = (i == num - 1) ? h - i * (h / num) : h / num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } -} - -void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) -{ - if (type == FILM_CONVERT) - return; - - if (update_progress_sample) { - if (pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); - } - - if (update_tile_sample) { - double current_time = time_dt(); - - if (current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } -} - -/* Adaptive Sampling */ - -AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0) -{ -} - -/* Render samples in steps that align with the adaptive filtering. */ -int AdaptiveSampling::align_samples(int sample, int num_samples) const -{ - int end_sample = sample + num_samples; - - /* Round down end sample to the nearest sample that needs filtering. */ - end_sample &= ~(adaptive_step - 1); - - if (end_sample <= sample) { - /* In order to reach the next sample that needs filtering, we'd need - * to increase num_samples. We don't do that in this function, so - * just keep it as is and don't filter this time around. 
*/ - return num_samples; - } - return end_sample - sample; -} - -bool AdaptiveSampling::need_filter(int sample) const -{ - if (sample > min_samples) { - return (sample & (adaptive_step - 1)) == (adaptive_step - 1); - } - else { - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h deleted file mode 100644 index 3f7cf47b692..00000000000 --- a/intern/cycles/device/device_task.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_TASK_H__ -#define __DEVICE_TASK_H__ - -#include "device/device_memory.h" - -#include "util/util_function.h" -#include "util/util_list.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -class Device; -class RenderBuffers; -class RenderTile; -class RenderTileNeighbors; -class Tile; - -enum DenoiserType { - DENOISER_NLM = 1, - DENOISER_OPTIX = 2, - DENOISER_OPENIMAGEDENOISE = 4, - DENOISER_NUM, - - DENOISER_NONE = 0, - DENOISER_ALL = ~0, -}; - -enum DenoiserInput { - DENOISER_INPUT_RGB = 1, - DENOISER_INPUT_RGB_ALBEDO = 2, - DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, - - DENOISER_INPUT_NUM, -}; - -typedef int DenoiserTypeMask; - -class DenoiseParams { - public: - /* Apply denoiser to image. */ - bool use; - /* Output denoising data passes (possibly without applying the denoiser). */ - bool store_passes; - - /* Denoiser type. 
*/ - DenoiserType type; - - /* Viewport start sample. */ - int start_sample; - - /** Native Denoiser. */ - - /* Pixel radius for neighboring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. */ - float feature_strength; - /* When removing pixels that don't carry information, - * use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - /** OIDN/Optix Denoiser. */ - - /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ - DenoiserInput input_passes; - - DenoiseParams() - { - use = false; - store_passes = false; - - type = DENOISER_NLM; - - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - - /* Default to color + albedo only, since normal input does not always have the desired effect - * when denoising with OptiX. */ - input_passes = DENOISER_INPUT_RGB_ALBEDO; - - start_sample = 0; - } - - /* Test if a denoising task needs to run, also to prefilter passes for the native - * denoiser when we are not applying denoising to the combined image. 
*/ - bool need_denoising_task() const - { - return (use || (store_passes && type == DENOISER_NLM)); - } -}; - -class AdaptiveSampling { - public: - AdaptiveSampling(); - - int align_samples(int sample, int num_samples) const; - bool need_filter(int sample) const; - - bool use; - int adaptive_step; - int min_samples; -}; - -class DeviceTask { - public: - typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - RenderBuffers *buffers; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0) const; - void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile &, uint)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile &)> update_tile_sample; - function<void(RenderTile &)> release_tile; - function<bool()> get_cancel; - function<bool()> get_tile_stolen; - function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; - function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; - - uint tile_types; - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - AdaptiveSampling adaptive_sampling; - - protected: - double last_update_time; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp index 5112fc152e5..678276ed025 100644 --- 
a/intern/cycles/device/device_dummy.cpp +++ b/intern/cycles/device/dummy/device.cpp @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "device/dummy/device.h" + #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" CCL_NAMESPACE_BEGIN @@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN class DummyDevice : public Device { public: - DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_) + DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_) { error_msg = info.error_msg; } @@ -61,23 +63,11 @@ class DummyDevice : public Device { virtual void const_copy_to(const char *, void *, size_t) override { } - - virtual void task_add(DeviceTask &) override - { - } - - virtual void task_wait() override - { - } - - virtual void task_cancel() override - { - } }; -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new DummyDevice(info, stats, profiler, background); + return new DummyDevice(info, stats, profiler); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/device/dummy/device.h index 8afaa686e28..832a9568129 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/device/dummy/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_do_volume.h" +#pragma once -#define KERNEL_NAME do_volume -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp new file mode 100644 index 00000000000..6dbcce2d9a5 --- /dev/null +++ b/intern/cycles/device/multi/device.cpp @@ -0,0 +1,423 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/multi/device.h" + +#include <sstream> +#include <stdlib.h> + +#include "bvh/bvh_multi.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "render/buffers.h" +#include "render/geometry.h" + +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +class MultiDevice : public Device { + public: + struct SubDevice { + Stats stats; + Device *device; + map<device_ptr, device_ptr> ptr_map; + int peer_island_index = -1; + }; + + list<SubDevice> devices; + device_ptr unique_key; + vector<vector<SubDevice *>> peer_islands; + + MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), unique_key(1) + { + foreach (const DeviceInfo &subinfo, info.multi_devices) { + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. */ + SubDevice *sub; + if (subinfo.type == DEVICE_CPU) { + devices.emplace_back(); + sub = &devices.back(); + } + else { + devices.emplace_front(); + sub = &devices.front(); + } + + /* The pointer to 'sub->stats' will stay valid even after new devices + * are added, since 'devices' is a linked list. 
*/ + sub->device = Device::create(subinfo, sub->stats, profiler); + } + + /* Build a list of peer islands for the available render devices */ + foreach (SubDevice &sub, devices) { + /* First ensure that every device is in at least once peer island */ + if (sub.peer_island_index < 0) { + peer_islands.emplace_back(); + sub.peer_island_index = (int)peer_islands.size() - 1; + peer_islands[sub.peer_island_index].push_back(&sub); + } + + if (!info.has_peer_memory) { + continue; + } + + /* Second check peer access between devices and fill up the islands accordingly */ + foreach (SubDevice &peer_sub, devices) { + if (peer_sub.peer_island_index < 0 && + peer_sub.device->info.type == sub.device->info.type && + peer_sub.device->check_peer_access(sub.device)) { + peer_sub.peer_island_index = sub.peer_island_index; + peer_islands[sub.peer_island_index].push_back(&peer_sub); + } + } + } + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() override + { + error_msg.clear(); + + foreach (SubDevice &sub, devices) + error_msg += sub.device->error_message(); + + return error_msg; + } + + virtual bool show_samples() const override + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const override + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; + foreach (const SubDevice &sub_device, devices) { + BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(); + bvh_layout_mask &= device_bvh_layout_mask; + bvh_layout_mask_all |= device_bvh_layout_mask; + } + + /* With multiple OptiX devices, every device needs its own acceleration structure */ + if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { + return BVH_LAYOUT_MULTI_OPTIX; + } + + /* When devices do not share a common BVH layout, fall back to creating one for each */ + const BVHLayoutMask 
BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); + if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { + return BVH_LAYOUT_MULTI_OPTIX_EMBREE; + } + + return bvh_layout_mask; + } + + bool load_kernels(const uint kernel_features) override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(kernel_features)) + return false; + + return true; + } + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override + { + /* Try to build and share a single acceleration structure, if possible */ + if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { + devices.back().device->build_bvh(bvh, progress, refit); + return; + } + + assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); + + BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); + bvh_multi->sub_bvhs.resize(devices.size()); + + vector<BVHMulti *> geom_bvhs; + geom_bvhs.reserve(bvh->geometry.size()); + foreach (Geometry *geom, bvh->geometry) { + geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); + } + + /* Broadcast acceleration structure build to all render devices */ + size_t i = 0; + foreach (SubDevice &sub, devices) { + /* Change geometry BVH pointers to the sub BVH */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; + } + + if (!bvh_multi->sub_bvhs[i]) { + BVHParams params = bvh->params; + if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) + params.bvh_layout = BVH_LAYOUT_OPTIX; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) + params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : + BVH_LAYOUT_EMBREE; + + /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree + * (since they are put into the top level directly, see bvh_embree.cpp) */ + if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && + !bvh->geometry[0]->is_instanced()) { + i++; + continue; + } + + bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); + } + + sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); + i++; + } + + /* Change geometry BVH pointers back to the multi BVH. */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]; + } + } + + virtual void *get_cpu_osl_memory() override + { + if (devices.size() > 1) { + return NULL; + } + return devices.front().device->get_cpu_osl_memory(); + } + + bool is_resident(device_ptr key, Device *sub_device) override + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + return find_matching_mem_device(key, sub)->device == sub_device; + } + } + return false; + } + + SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) + { + assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); + + /* Get the memory owner of this key (first try current device, then peer devices) */ + SubDevice *owner_sub = ⊂ + if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { + foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { + if (island_sub != owner_sub && + island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { + owner_sub = island_sub; + } + } + } + return owner_sub; + } + + SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) + { + assert(!island.empty()); + + /* Get the memory owner of this key or the device with the lowest memory usage when new */ + SubDevice *owner_sub = island.front(); + foreach (SubDevice *island_sub, island) { + if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : + (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { + owner_sub = island_sub; + } + } + return owner_sub; + } + + inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) + { + return find_matching_mem_device(key, sub)->ptr_map[key]; + } + + void mem_alloc(device_memory &mem) override + { + device_ptr key = unique_key++; + + assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY); + /* The remaining memory types can be distributed across devices */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(key, island); + mem.device = owner_sub->device; + mem.device_pointer = 0; + mem.device_size = 0; + + owner_sub->device->mem_alloc(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + /* The tile buffers are allocated on each device (see below), so copy to all of them */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_copy_to(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + + if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { + /* Need to create texture objects and update pointer in kernel globals on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_copy_to(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + SubDevice *owner_sub = find_matching_mem_device(key, sub); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + + owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_zero(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) override + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + /* Free memory that was allocated for all devices (see above) on each device */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + mem.device_size = existing_size; + + owner_sub->device->mem_free(mem); + owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); + + if (mem.type == MEM_TEXTURE) { + /* Free texture objects on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_free(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + int device_number(Device *sub_device) override + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + virtual void foreach_device(const function<void(Device *)> &callback) override + { + foreach (SubDevice &sub, devices) { + sub.device->foreach_device(callback); + } + } +}; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new MultiDevice(info, stats, profiler); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/device/multi/device.h index 192d01444ba..6e121014a1f 100644 
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/device/multi/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_indirect_background.h" +#pragma once -#define KERNEL_NAME indirect_background -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h deleted file mode 100644 index a65e764b0d4..00000000000 --- a/intern/cycles/device/opencl/device_opencl.h +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" -# include "util/util_task.h" - -# include "clew.h" - -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -/* Disable workarounds, seems to be working fine on latest drivers. */ -# define CYCLES_DISABLE_DRIVER_WORKAROUNDS - -/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */ -# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS -/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ - -# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string &platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string &device_name, - const string &hardware_id, - const string &device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) - { - } - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string 
device_extensions; -}; - -/* Contains all static OpenCL helper functions. */ -class OpenCLInfo { - public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string &platform_name, const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, string *error = NULL); - static bool device_version_check(cl_device_id device, string *error = NULL); - static bool get_device_version(cl_device_id device, - int *r_major, - int *r_minor, - string *error = NULL); - static string get_hardware_id(const string &platform_name, cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. */ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. 
*/ - static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int *error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. - */ - static string get_readable_device_name(cl_device_id device_id); -}; - -/* Thread safe cache for contexts and programs. - */ -class OpenCLCache { - struct Slot { - struct ProgramEntry { - ProgramEntry(); - ProgramEntry(const ProgramEntry &rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot &rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache &global_instance(); - - public: - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker); - - static string get_kernel_md5(); -}; - -# define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if ((device)->error_message() == "") { \ - (device)->set_error(message); \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -# define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if (error_msg == "") { \ - error_msg = message; \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -class OpenCLDevice : public Device { - public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - std::atomic<int> load_kernel_num_compiling; - - cl_context cxContext; - 
cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) - { - } - OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_name, - const string &kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, fail-back to local). */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const - { - return loaded; - } - const string &get_log() const - { - return log; - } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string &clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. 
*/ - bool load_binary(const string &clbin, const string *debug_src = NULL); - bool save_binary(const string &clbin); - - void add_log(const string &msg, bool is_debug); - void add_error(const string &msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. */ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given - * `programs` parameter. 
*/ - void load_kernels(vector<OpenCLProgram *> &programs, - const DeviceRequestedFeatures &requested_features); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - - typedef map<string, device_vector<uchar> *> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string &message); - void opencl_assert_err(cl_int err, const char *where); - - OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data); - - bool opencl_version_check(); - OpenCLSplitPrograms *get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures &requested_features); - void load_required_kernels(const DeviceRequestedFeatures &requested_features); - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string &kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string &kernel_name); - string get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures &features); - - void mem_alloc(device_memory &mem); - void 
mem_copy_to(device_memory &mem); - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); - void mem_zero(device_memory &mem); - void mem_free(device_memory &mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void global_alloc(device_memory &mem); - void global_free(device_memory &mem); - void tex_alloc(device_texture &mem); - void tex_free(device_texture &mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, - size_t w, - size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - void shader(DeviceTask &task); - void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); - void bake(DeviceTask &task, RenderTile &tile); - - void denoise(RenderTile &tile, DenoisingTask &denoising); - - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - - void task_add(DeviceTask &task) - { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask &task); - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const - { - return true; - } - - protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr 
color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) - { - } - - ArgumentWrapper(float argument) - : size(sizeof(float)), float_value(argument), pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the 
future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1 = ArgumentWrapper(), - const ArgumentWrapper &arg2 = ArgumentWrapper(), - const ArgumentWrapper &arg3 = ArgumentWrapper(), - const ArgumentWrapper &arg4 = ArgumentWrapper(), - const ArgumentWrapper &arg5 = ArgumentWrapper(), - const ArgumentWrapper &arg6 = ArgumentWrapper(), - const ArgumentWrapper &arg7 = ArgumentWrapper(), - const ArgumentWrapper &arg8 = ArgumentWrapper(), - const ArgumentWrapper &arg9 = ArgumentWrapper(), - const ArgumentWrapper &arg10 = ArgumentWrapper(), - const ArgumentWrapper &arg11 = ArgumentWrapper(), - const ArgumentWrapper &arg12 = ArgumentWrapper(), - const ArgumentWrapper &arg13 = ArgumentWrapper(), - const ArgumentWrapper &arg14 = ArgumentWrapper(), - const ArgumentWrapper &arg15 = ArgumentWrapper(), - const ArgumentWrapper &arg16 = ArgumentWrapper(), - const ArgumentWrapper &arg17 = ArgumentWrapper(), - const ArgumentWrapper &arg18 = ArgumentWrapper(), - const ArgumentWrapper &arg19 = ArgumentWrapper(), - const ArgumentWrapper &arg20 = ArgumentWrapper(), - const ArgumentWrapper &arg21 = ArgumentWrapper(), - const ArgumentWrapper &arg22 = ArgumentWrapper(), - const ArgumentWrapper &arg23 = ArgumentWrapper(), - const ArgumentWrapper &arg24 = ArgumentWrapper(), - const ArgumentWrapper &arg25 = ArgumentWrapper(), - const ArgumentWrapper &arg26 = ArgumentWrapper(), - const ArgumentWrapper &arg27 = ArgumentWrapper(), - const ArgumentWrapper &arg28 = ArgumentWrapper(), - const ArgumentWrapper &arg29 = ArgumentWrapper(), - const ArgumentWrapper &arg30 = ArgumentWrapper(), - const ArgumentWrapper &arg31 = ArgumentWrapper(), - const ArgumentWrapper &arg32 = ArgumentWrapper(), - const ArgumentWrapper &arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void 
release_program_safe(cl_program program); - - /* ** Those guys are for working around some compiler-specific bugs ** */ - - cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); - - void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); - - private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory *> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - - protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; -}; - -Device *opencl_create_split_device(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background); - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp deleted file mode 100644 index 31a2265700c..00000000000 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ /dev/null @@ -1,2113 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" - -# include "kernel/kernel_types.h" -# include "kernel/split/kernel_split_data_types.h" - -# include "util/util_algorithm.h" -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -struct texture_slot_t { - texture_slot_t(const string &name, int slot) : name(name), slot(slot) - { - } - string name; - int slot; -}; - -static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; - -static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update " - "adaptive_stopping " - "adaptive_filter_x " - "adaptive_filter_y " - "adaptive_adjust_samples"; - -const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) -{ - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } -} - -const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) -{ - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } -} - -/* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) -{ - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) { - 
features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } -} - -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name) -{ - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if (device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only useful when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; -} - -OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) -{ - device = device_; -} - -OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() -{ - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); -} - -void OpenCLDevice::OpenCLSplitPrograms::load_kernels( - vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features) -{ - if 
(!requested_features.use_baking) { -# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ - program_split.add_kernel(ustring("path_trace_" #kernel_name)); -# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_" #kernel_name; \ - program_##kernel_name = OpenCLDevice::OpenCLProgram( \ - device, \ - program_name_##kernel_name, \ - "kernel_" #kernel_name ".cl", \ - device->get_build_options(requested_features, program_name_##kernel_name)); \ - program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - if (requested_features.use_volume) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. 
*/ - program_split = OpenCLDevice::OpenCLProgram( - device, - "split_bundle", - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle")); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); - programs.push_back(&program_split); - -# undef ADD_SPLIT_KERNEL_PROGRAM -# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } -} - -namespace { - -/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ -typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; -} KernelGlobalsDummy; - -} // namespace - -struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; -}; - -class OpenCLSplitKernelFunction : public SplitKernelFunction { - public: - OpenCLDevice *device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory &cached_memory; - int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) - : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) - { - if (cached_id != cached_memory.id) { - cl_uint start_arg_index = device->kernel_set_args( - program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } -}; - -class 
OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; - - public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) - { - } - - virtual SplitKernelFunction *get_split_kernel_function( - const string &kernel_name, const DeviceRequestedFeatures &requested_features) - { - OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = OpenCLDevice::OpenCLProgram( - device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, program_name)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if (!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_state_buffer_size = programs->program_split( - ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - 
device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. 
*/ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. */ - if (type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) - << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. 
*/ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), - (int)sqrt(num_elements)); - - if (device->info.description.find("Intel") != string::npos) { - global_size = make_int2(min(512, global_size.x), min(512, global_size.y)); - } - - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } -}; - -bool OpenCLDevice::opencl_error(cl_int err) -{ - if (err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; -} - -void OpenCLDevice::opencl_error(const string &message) -{ - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -} - -void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) -{ - if (err != CL_SUCCESS) { - string message = string_printf( - "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -# ifndef NDEBUG - abort(); -# endif - } -} - -OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) - : Device(info, stats, profiler, background), - load_kernel_num_compiling(0), - kernel_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_GLOBAL) -{ - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - device_initialized = false; - textures_need_update = true; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice 
&platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if (cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; - - /* create context */ - cxContext = clCreateContext( - context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); - - if (opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if (opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - /* Allocate this right away so that texture_info - * is placed at offset 0 in the device memory buffers. 
*/ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); -} - -OpenCLDevice::~OpenCLDevice() -{ - task_pool.cancel(); - load_required_kernel_task_pool.cancel(); - load_kernel_task_pool.cancel(); - - memory_manager.free(); - - ConstMemMap::iterator mt; - for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } - - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); - - if (cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if (cxContext) - clReleaseContext(cxContext); - - delete split_kernel; -} - -void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data) -{ - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); -} - -bool OpenCLDevice::opencl_version_check() -{ - string error; - if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if (!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; -} - -string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) -{ - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; - - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - - md5.append((uint8_t *)vendor, strlen(vendor)); - md5.append((uint8_t *)version, strlen(version)); - md5.append((uint8_t *)name, strlen(name)); - md5.append((uint8_t *)driver, 
strlen(driver)); - - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t *)options.c_str(), options.size()); - - return md5.get_hex(); -} - -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if (!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if (!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram *> programs; - kernel_programs.load_kernels(programs, requested_features); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram( - this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches 
multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_num_compiling++; - load_kernel_task_pool.push([=] { - program->compile(); - load_kernel_num_compiling--; - }); - } - } - return true; -} - -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) -{ - vector<OpenCLProgram *> programs; - base_program = OpenCLProgram( - this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram( - this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, - "background", - "kernel_background.cl", - get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram( - this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } -} - -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) -{ - if (requested_features.use_baking) { - /* For baking, kernels have already been loaded in load_required_kernels(). 
*/ - return true; - } - - load_kernel_task_pool.wait_work(); - return split_kernel->load_kernels(requested_features); -} - -OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() -{ - return &kernel_programs; -} - -DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() -{ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; -} - -void OpenCLDevice::mem_alloc(device_memory &mem) -{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if (size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if (mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. 
- */ - if (size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = 0; - } - - stats.mem_alloc(size); - mem.device_size = size; -} - -void OpenCLDevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if (size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, - NULL)); - } - } -} - -void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - size_t offset = elem * y * w; - size_t size = elem * w * h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar *)mem.host_pointer + offset, - 0, - NULL, - NULL)); -} - -void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) -{ - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while (d_offset < size) { - d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel( - cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } -} - -void OpenCLDevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); 
- } - - if (mem.device_pointer) { - if (base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if (!base_program.is_loaded()) { - void *zero = mem.host_pointer; - - if (!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, - NULL)); - - if (!mem.host_pointer) { - util_aligned_free(zero); - } - } - } -} - -void OpenCLDevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - if (mem.device_pointer) { - if (mem.device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } -} - -int OpenCLDevice::mem_sub_ptr_alignment() -{ - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); -} - -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) -{ - cl_mem_flags mem_flag; - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr)clCreateSubBuffer( - CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; -} - -void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) -{ - if (device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } -} - -void OpenCLDevice::const_copy_to(const char 
*name, void *host, size_t size) -{ - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if (i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); -} - -void OpenCLDevice::global_alloc(device_memory &mem) -{ - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::global_free(device_memory &mem) -{ - if (mem.device_pointer) { - mem.device_pointer = 0; - - if (memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach (TexturesMap::value_type &value, textures) { - if (value.second == &mem) { - textures.erase(value.first); - break; - } - } - } -} - -void OpenCLDevice::tex_alloc(device_texture &mem) -{ - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::tex_free(device_texture &mem) -{ - global_free(mem); -} - -size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) -{ - int r = global_size % group_size; - return global_size + ((r == 0) ? 
0 : group_size - r); -} - -void OpenCLDevice::enqueue_kernel( - cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) -{ - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo( - kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo( - cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); - - if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if (x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if (local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size / max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if (h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert( - clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); -} - -void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) -{ - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if (i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - ptr = 0; - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); -} - -void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - flush_texture_buffers(); - - memory_manager.set_kernel_arg_buffers(kernel, narg); -} - -void OpenCLDevice::flush_texture_buffers() -{ - if (!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -# define KERNEL_TEX(type, name) \ - if (textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -# include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach (TexturesMap::value_type &tex, textures) { - string name = tex.first; - device_memory *mem = tex.second; - - if (mem->type == MEM_TEXTURE) { - const uint id = ((device_texture *)mem)->slot; - texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. 
*/ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach (texture_slot_t &slot, texture_slots) { - device_memory *mem = textures[slot.name]; - TextureInfo &info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - - if (mem->type == MEM_TEXTURE) { - info = ((device_texture *)mem)->info; - } - else { - memset(&info, 0, sizeof(TextureInfo)); - } - - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} - -void OpenCLDevice::thread_run(DeviceTask &task) -{ - flush_texture_buffers(); - - if (task.type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if (tile.task == RenderTile::BAKE) { - bake(task, tile); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - } - - kgbuffer.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - } - else if (task.type == DeviceTask::FILM_CONVERT) { - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void OpenCLDevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f / (task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : - base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); -} - -bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - device_sub_ptr weightAccum( - task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); - mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel 
ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, - 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - channel_offset, - 0, - a, - k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args(ckNLMUpdateOutput, - 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, - h, - stride, - pass_stride, - channel_offset, - r, - f); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) -{ - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time ? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); - - return true; -} - -bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = 
task->buffer.use_time ? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, - 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args(ckNLMConstructGramian, - 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, - h, - stride, - pass_stride, - r, - 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256); - - return true; -} - -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - 
kernel_set_args(ckFinalize, - 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); - enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args( - ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, - arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - 
task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterGetFeature, - arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, - 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - return true; -} - -bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - cl_mem image_mem = 
CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, - 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void OpenCLDevice::shader(DeviceTask 
&task) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); - - for (int sample = 0; sample < task.num_samples; sample++) { - - if (task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } -} - -void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) -{ - scoped_timer timer(&rtile.buffers->render_time); - - /* Cast arguments to cl types. 
*/ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - bake_program.wait_for_availability(); - cl_kernel kernel = bake_program(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args( - kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); - - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, d_w, d_h); - clFinish(cqCommandQueue); - - rtile.sample = sample + 1; - - task.update_progress(&rtile, rtile.w * rtile.h); - } -} - -static bool kernel_build_opencl_2(cl_device_id cdDevice) -{ - /* Build with OpenCL 2.0 if available, this improves performance - * with AMD OpenCL drivers on Windows and Linux (legacy drivers). - * Note that OpenCL selects the highest 1.x version by default, - * only for 2.0 do we need the explicit compiler flag. */ - int version_major, version_minor; - if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { - if (version_major >= 2) { - /* This appears to trigger a driver bug in Radeon RX cards with certain - * driver version, so don't use OpenCL 2.0 for those. 
*/ - string device_name = OpenCLInfo::get_readable_device_name(cdDevice); - if (string_startswith(device_name, "Radeon RX 4") || - string_startswith(device_name, "Radeon (TM) RX 4") || - string_startswith(device_name, "Radeon RX 5") || - string_startswith(device_name, "Radeon (TM) RX 5")) { - char version[256] = ""; - int driver_major, driver_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { - return !(driver_major == 3075 && driver_minor <= 12); - } - } - - return true; - } - } - - return false; -} - -string OpenCLDevice::kernel_build_options(const string *debug_src) -{ - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if (kernel_build_opencl_2(cdDevice)) { - build_options += "-cl-std=CL2.0 "; - } - - if (platform_name == "NVIDIA CUDA") { - build_options += - "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), - &compute_capability_major, - NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), - &compute_capability_minor, - NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + compute_capability_minor * 10); - } - - else if (platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if (platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if (platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. 
- */ - if (OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if (info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if (OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -# ifdef WITH_NANOVDB - if (info.has_nanovdb) { - build_options += "-DWITH_NANOVDB "; - } -# endif - - return build_options; -} - -/* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ -int OpenCLDevice::kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1, - const ArgumentWrapper &arg2, - const ArgumentWrapper &arg3, - const ArgumentWrapper &arg4, - const ArgumentWrapper &arg5, - const ArgumentWrapper &arg6, - const ArgumentWrapper &arg7, - const ArgumentWrapper &arg8, - const ArgumentWrapper &arg9, - const ArgumentWrapper &arg10, - const ArgumentWrapper &arg11, - const ArgumentWrapper &arg12, - const ArgumentWrapper &arg13, - const ArgumentWrapper &arg14, - const ArgumentWrapper &arg15, - const ArgumentWrapper &arg16, - const ArgumentWrapper &arg17, - const ArgumentWrapper &arg18, - const ArgumentWrapper &arg19, - const ArgumentWrapper &arg20, - const ArgumentWrapper &arg21, - const ArgumentWrapper &arg22, - const ArgumentWrapper &arg23, - const ArgumentWrapper &arg24, - const ArgumentWrapper &arg25, - const ArgumentWrapper &arg26, - const ArgumentWrapper &arg27, - const ArgumentWrapper &arg28, - const ArgumentWrapper &arg29, - const ArgumentWrapper &arg30, - const ArgumentWrapper &arg31, - const ArgumentWrapper &arg32, - const ArgumentWrapper &arg33) -{ - int current_arg_index = 0; -# define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if (arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while (false) - 
FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -# undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; -} - -void OpenCLDevice::release_kernel_safe(cl_kernel kernel) -{ - if (kernel) { - clReleaseKernel(kernel); - } -} - -void OpenCLDevice::release_mem_object_safe(cl_mem mem) -{ - if (mem != NULL) { - clReleaseMemObject(mem); - } -} - -void OpenCLDevice::release_program_safe(cl_program program) -{ - if (program) { - clReleaseProgram(program); - } -} - -/* ** Those guys are for working around some compiler-specific bugs ** */ - -cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) -{ - return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); -} - -void OpenCLDevice::store_cached_kernel(cl_program program, - ustring key, - thread_scoped_lock &cache_locker) -{ - OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); -} - -Device *opencl_create_split_device(DeviceInfo &info, - 
Stats &stats, - Profiler &profiler, - bool background) -{ - return new OpenCLDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp deleted file mode 100644 index 4330e07cb37..00000000000 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -# include "util/util_foreach.h" - -# include "device/opencl/device_opencl.h" -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) -{ - allocations.push_back(&allocation); -} - -void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) -{ - bool need_realloc = false; - - /* Calculate total size and remove any freed. */ - size_t total_size = 0; - - for (int i = allocations.size() - 1; i >= 0; i--) { - Allocation *allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if (!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin() + i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. 
*/ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if (allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - /* Always allocate non-empty buffer, NULL pointers cause problems with some drivers. */ - total_size = std::max(total_size, (size_t)16); - - if (need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, - "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. */ - opencl_device_assert(device, - clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, - NULL, - NULL)); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. 
*/ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); -} - -void MemoryManager::DeviceBuffer::free(OpenCLDevice *) -{ - buffer->free(); -} - -MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() -{ - DeviceBuffer *smallest = device_buffers; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } - - return smallest; -} - -MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); - } -} - -void MemoryManager::free() -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.free(device); - } -} - -void MemoryManager::alloc(const char *name, device_memory &mem) -{ - Allocation &allocation = allocations[name]; - - allocation.mem = &mem; - allocation.needs_copy_to_device = true; - - if (!allocation.device_buffer) { - DeviceBuffer *device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; - - allocation.desc.device_buffer = device_buffer - device_buffers; - - device_buffer->add_allocation(allocation); - - device_buffer->size += mem.memory_size(); - } - - need_update = true; -} - -bool MemoryManager::free(device_memory &mem) -{ - foreach (AllocationsMap::value_type &value, allocations) { - Allocation &allocation = value.second; - if (allocation.mem == &mem) { - - allocation.device_buffer->size -= mem.memory_size(); - - allocation.mem = NULL; - 
allocation.needs_copy_to_device = false; - - need_update = true; - return true; - } - } - - return false; -} - -MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) -{ - update_device_memory(); - - Allocation &allocation = allocations[name]; - return allocation.desc; -} - -void MemoryManager::update_device_memory() -{ - if (!need_update) { - return; - } - - need_update = false; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.update_device_memory(device); - } -} - -void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - update_device_memory(); - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++); - } - } -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h deleted file mode 100644 index 23624f837a6..00000000000 --- a/intern/cycles/device/opencl/memory_manager.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "device/device.h" - -#include "util/util_map.h" -#include "util/util_string.h" -#include "util/util_vector.h" - -#include "clew.h" - -CCL_NAMESPACE_BEGIN - -class OpenCLDevice; - -class MemoryManager { - public: - static const int NUM_DEVICE_BUFFERS = 8; - - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; - - private: - struct DeviceBuffer; - - struct Allocation { - device_memory *mem; - - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ - - BufferDescriptor desc; - - bool needs_copy_to_device; - - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; - - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation *> allocations; - size_t size; /* Size of all allocations. */ - - DeviceBuffer() : buffer(NULL), size(0) - { - } - - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } - - void add_allocation(Allocation &allocation); - - void update_device_memory(OpenCLDevice *device); - - void free(OpenCLDevice *device); - }; - - OpenCLDevice *device; - - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; - - bool need_update; - - DeviceBuffer *smallest_device_buffer(); - - public: - MemoryManager(OpenCLDevice *device); - - void free(); /* Free all memory. 
*/ - - void alloc(const char *name, device_memory &mem); - bool free(device_memory &mem); - - BufferDescriptor get_descriptor(string name); - - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); -}; - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp deleted file mode 100644 index 3929cf77f15..00000000000 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ /dev/null @@ -1,1326 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device_intern.h" -# include "device/opencl/device_opencl.h" - -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_semaphore.h" -# include "util/util_system.h" -# include "util/util_time.h" - -using std::cerr; -using std::endl; - -CCL_NAMESPACE_BEGIN - -OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) - : program(rhs.program), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::~ProgramEntry() -{ - delete mutex; -} - -OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) -{ -} - -OpenCLCache::Slot::Slot(const Slot &rhs) - : context_mutex(NULL), context(NULL), programs(rhs.programs) -{ -} - -OpenCLCache::Slot::~Slot() -{ - delete context_mutex; -} - -OpenCLCache &OpenCLCache::global_instance() -{ - static OpenCLCache instance; - return instance; -} - -cl_context OpenCLCache::get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - /* create slot lock only while holding cache lock */ - if (!slot.context_mutex) - slot.context_mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); - - /* If the thing isn't cached */ - if (slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = 
clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return slot.context; -} - -cl_program OpenCLCache::get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - - Slot::ProgramEntry &entry = ins2.first->second; - - /* create slot lock only while holding cache lock */ - if (!entry.mutex) - entry.mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); - - /* If the thing isn't cached */ - if (entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return entry.program; -} - -void OpenCLCache::store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); - - Slot &slot = i->second; - - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); - - slot.context = context; - - /* unlock the slot */ - slot_locker.unlock(); - - /* increment 
reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -void OpenCLCache::store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; - - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; - - assert(entry.program == NULL); - - cache_lock.unlock(); - - entry.program = program; - - /* unlock the slot */ - slot_locker.unlock(); - - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -string OpenCLCache::get_kernel_md5() -{ - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); - - if (self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; -} - -static string get_program_source(const string &kernel_file) -{ - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. 
- */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; -} - -OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_file, - const string &kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) -{ - loaded = false; - needs_compiling = true; - program = NULL; -} - -OpenCLDevice::OpenCLProgram::~OpenCLProgram() -{ - release(); -} - -void OpenCLDevice::OpenCLProgram::release() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - if (kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if (program) { - clReleaseProgram(program); - program = NULL; - } -} - -void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) -{ - if (!use_stdout) { - log += msg + "\n"; - } - else if (!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } -} - -void OpenCLDevice::OpenCLProgram::add_error(const string &msg) -{ - if (use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if (error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; -} - -void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) -{ - if (!kernels.count(name)) { - kernels[name] = NULL; - } -} - -bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) -{ - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; - - VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - /* show warnings even if build is successful */ - size_t ret_val_size = 0; - - clGetProgramBuildInfo(program, device->cdDevice, 
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + - ", errors in console."); - } - - if (ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo( - program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if (!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), - ciErr == CL_SUCCESS); - } - } - - return (ciErr == CL_SUCCESS); -} - -bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) -{ - string source = get_program_source(kernel_file); - - if (debug_src) { - path_write_text(*debug_src, source); - } - - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; - - program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } - - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - - if (!build_kernel(debug_src)) - return false; - - double elapsed = time_dt() - starttime; - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return true; -} - -static void escape_python_string(string &str) -{ - /* Escape string to be passed as a Python raw string with '' quotes'. 
*/ - string_replace(str, "'", "\'"); -} - -static int opencl_compile_process_limit() -{ - /* Limit number of concurrent processes compiling, with a heuristic based - * on total physical RAM and estimate of memory usage needed when compiling - * with all Cycles features enabled. - * - * This is somewhat arbitrary as we don't know the actual available RAM or - * how much the kernel compilation will needed depending on the features, but - * better than not limiting at all. */ - static const int64_t GB = 1024LL * 1024LL * 1024LL; - static const int64_t process_memory = 2 * GB; - static const int64_t base_memory = 2 * GB; - static const int64_t system_memory = system_physical_ram(); - static const int64_t process_limit = (system_memory - base_memory) / process_memory; - - return max((int)process_limit, 1); -} - -bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) -{ - /* Construct arguments. */ - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back(string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - /* Limit number of concurrent processes compiling. 
*/ - static thread_counting_semaphore semaphore(opencl_compile_process_limit()); - semaphore.acquire(); - - /* Compile. */ - const double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - const bool success = system_call_self(args); - const double elapsed = time_dt() - starttime; - - semaphore.release(); - - if (!success || !path_exists(clbin)) { - return false; - } - - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return load_binary(clbin); -} - -/* Compile opencl kernel. This method is called from the _cycles Python - * module compile kernels. Parameters must match function above. */ -bool device_opencl_compile_kernel(const vector<string> ¶meters) -{ - int device_platform_id = std::stoi(parameters[0]); - const string &device_name = parameters[1]; - const string &platform_name = parameters[2]; - const string &build_options = parameters[3]; - const string &kernel_file = parameters[4]; - const string &binary_path = parameters[5]; - - if (clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; - if (platform_device.platform_name != platform_name || - platform_device.device_name != device_name) { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS) { - return false; - } - - string source = 
get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if (err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if (err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if (size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - clReleaseContext(context); - - return result; -} - -bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) -{ - /* read binary into memory */ - vector<uint8_t> binary; - - if (!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } - - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; - - program = clCreateProgramWithBinary( - device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - - if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + - clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } - - if (!build_kernel(debug_src)) - return false; - - return true; -} - -bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) -{ - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - - if (!size) - return false; - - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - - return path_write_binary(clbin, 
binary); -} - -bool OpenCLDevice::OpenCLProgram::load() -{ - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if (path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, cache_key, cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; -} - -void OpenCLDevice::OpenCLProgram::compile() -{ - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - - if (!program) { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if (OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - if (DebugFlags().running_inside_blender && compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - if (DebugFlags().running_inside_blender) { - add_log(string("Separate-process building of ") + clbin + - " failed, will fall back to regular building.", - true); - } - - /* If does not exist or loading binary failed, compile kernel. */ - if (!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if (!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. 
*/ - device->store_cached_kernel(program, cache_key, cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; -} - -void OpenCLDevice::OpenCLProgram::create_kernels() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if (device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + - clewErrorString(ciErr)); - return; - } - } -} - -bool OpenCLDevice::OpenCLProgram::wait_for_availability() -{ - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; -} - -void OpenCLDevice::OpenCLProgram::report_error() -{ - /* If loaded is true, there was no error. */ - if (loaded) - return; - /* if use_stdout is true, the error was already reported. 
*/ - if (use_stdout) - return; - - cerr << error_msg << endl; - if (!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()() -{ - assert(kernels.size() == 1); - return kernels.begin()->second; -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) -{ - assert(kernels.count(name)); - return kernels[name]; -} - -cl_device_type OpenCLInfo::device_type() -{ - switch (DebugFlags().opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } -} - -bool OpenCLInfo::use_debug() -{ - return DebugFlags().opencl.debug; -} - -bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return false; - } - string device_name; - if (!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if (!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - if (getenv("CYCLES_OPENCL_TEST")) { - return true; - } - - /* Allow Intel GPUs on Intel OpenCL platform. */ - if (platform_name.find("Intel") != string::npos) { - if (device_type != CL_DEVICE_TYPE_GPU) { - /* OpenCL on Intel CPU is not an officially supported configuration. - * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. 
*/ - return false; - } - -# ifdef __APPLE__ - /* Apple uses own framework, which can also put Iris onto AMD frame-work. - * This isn't supported configuration. */ - return false; -# else - if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) { - return true; - } -# endif - } - - if (platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) { - if (driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = {/* GCN 1 */ - "Tahiti", - "Pitcairn", - "Capeverde", - "Oland", - "Hainan", - NULL}; - for (int i = 0; blacklist[i] != NULL; i++) { - if (device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; -} - -bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if (error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf( - "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error) -{ - char version[256]; - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) { 
- if (error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::device_version_check(cl_device_id device, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - if (!get_device_version(device, &major, &minor, error)) { - return false; - } - - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) -{ - if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. */ - cl_char topology[24]; - if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && - topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if (platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. 
*/ - return ""; -} - -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices) -{ - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -# define FIRST_VLOG(severity) \ - if (first_time) \ - VLOG(severity) - - usable_devices->clear(); - - if (device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if (!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); - first_time = false; - return; - } - if (platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for (int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; - if (!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " << string(clewErrorString(error)); - continue; - } - if (device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; - continue; - } - for (int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - string device_name; - if (!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch 
device name: " << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if (!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; - continue; - } - if (device_supported(platform_name, device_id)) { - cl_device_type device_type; - if (!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" << string(clewErrorString(error)); - continue; - } - string readable_device_name = get_readable_device_name(device_id); - if (readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; - } - } - } - first_time = false; -} - -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) -{ - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. 
*/ - cl_int err; - platform_ids->resize(num_platforms); - if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_platform_id> OpenCLInfo::get_platforms() -{ - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; -} - -bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) -{ - cl_int err; - if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platforms() -{ - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; -} - -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) -{ - char buffer[256]; - if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != - CL_SUCCESS) { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; -} - -string OpenCLInfo::get_platform_name(cl_platform_id platform_id) -{ - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; -} - -bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, 
device_type, &num_devices)) { - return 0; - } - return num_devices; -} - -bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error) -{ - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { - return false; - } - /* Get actual device list. */ - device_ids->resize(num_devices); - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; -} - -bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; -} - -string OpenCLInfo::get_device_name(cl_device_id device_id) -{ - string device_name; - if (!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; -} - -bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error) -{ - size_t extension_length = 0; - cl_int err; - /* Determine the size of the extension string. 
*/ - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - vector<char> buffer(extension_length); - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = string(buffer.data()); - return true; -} - -string OpenCLInfo::get_device_extensions(cl_device_id device_id) -{ - string device_extensions; - if (!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; -} - -bool OpenCLInfo::get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; -} - -string OpenCLInfo::get_readable_device_name(cl_device_id device_id) -{ - string name = ""; - char board_name[1024]; - size_t length = 0; - if (clGetDeviceInfo( - device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == - CL_SUCCESS) { - if (length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if (name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. 
- */ - if (name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if (clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; -} - -bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - if (sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; -} - -int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) -{ - int base_align_bits; - if (clGetDeviceInfo( - device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == - CL_SUCCESS) { - return base_align_bits / 8; - } - return 1; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp new file mode 100644 index 00000000000..13f23bd229a --- /dev/null +++ b/intern/cycles/device/optix/device.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/optix/device.h" + +#include "device/cuda/device.h" +#include "device/optix/device_impl.h" +#include "util/util_logging.h" + +#ifdef WITH_OPTIX +# include <optix_function_table_definition.h> +#endif + +CCL_NAMESPACE_BEGIN + +bool device_optix_init() +{ +#ifdef WITH_OPTIX + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) { + /* Already initialized function table. */ + return true; + } + + /* Need to initialize CUDA as well. */ + if (!device_cuda_init()) { + return false; + } + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " + "Please update to the latest driver first!"; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + /* Loaded OptiX successfully! */ + return true; +#else + return false; +#endif +} + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) +{ +#ifdef WITH_OPTIX + devices.reserve(cuda_devices.size()); + + /* Simply add all supported CUDA devices as OptiX devices again. */ + for (DeviceInfo info : cuda_devices) { + assert(info.type == DEVICE_CUDA); + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + if (major < 5) { + /* Only Maxwell and up are supported by OptiX. 
*/ + continue; + } + + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; + + devices.push_back(info); + } +#else + (void)cuda_devices; + (void)devices; +#endif +} + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_OPTIX + return new OptiXDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h new file mode 100644 index 00000000000..29fa729c2e4 --- /dev/null +++ b/intern/cycles/device/optix/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_optix_init(); + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp new file mode 100644 index 00000000000..cd16b8c9f01 --- /dev/null +++ b/intern/cycles/device/optix/device_impl.cpp @@ -0,0 +1,1573 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_OPTIX + +# include "device/optix/device_impl.h" + +# include "bvh/bvh.h" +# include "bvh/bvh_optix.h" +# include "integrator/pass_accessor_gpu.h" +# include "render/buffers.h" +# include "render/hair.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/pass.h" +# include "render/scene.h" + +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_progress.h" +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) + : device(device), queue(device), state(device, "__denoiser_state") +{ +} + +OptiXDevice::Denoiser::~Denoiser() +{ + const CUDAContextScope scope(device); + if (optix_denoiser != nullptr) { + optixDenoiserDestroy(optix_denoiser); + } +} + +OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : CUDADevice(info, stats, profiler), + sbt_data(this, "__sbt", MEM_READ_ONLY), + launch_params(this, "__params"), + denoiser_(this) +{ + /* Make the CUDA context current. */ + if (!cuContext) { + /* Do not initialize if CUDA context creation failed already. */ + return; + } + const CUDAContextScope scope(this); + + /* Create OptiX context for this device. */ + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. 
*/ + options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + if (DebugFlags().optix.use_debug) { + options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; + } + optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + optix_assert(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + /* Fix weird compiler bug that assigns wrong size. */ + launch_params.data_elements = sizeof(KernelParamsOptiX); + + /* Allocate launch parameter buffer memory on device. */ + launch_params.alloc_to_device(1); +} + +OptiXDevice::~OptiXDevice() +{ + /* Make CUDA context current. */ + const CUDAContextScope scope(this); + + free_bvh_memory_delayed(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + /* Unload modules. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + } + } + + optixDeviceContextDestroy(context); +} + +unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create() +{ + return make_unique<OptiXDeviceQueue>(this); +} + +BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const +{ + /* OptiX has its own internal acceleration structure format. 
*/ + return BVH_LAYOUT_OPTIX; +} + +string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features); + + /* Add OptiX SDK include directory to include paths. */ + const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); + if (optix_sdk_path) { + common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); + } + + /* Specialization for shader raytracing. */ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + common_cflags += " --keep-device-functions"; + } + + return common_cflags; +} + +bool OptiXDevice::load_kernels(const uint kernel_features) +{ + if (have_error()) { + /* Abort early if context creation failed already. */ + return false; + } + + /* Load CUDA modules because we need some of the utility kernels. */ + if (!CUDADevice::load_kernels(kernel_features)) { + return false; + } + + /* Skip creating OptiX module if only doing denoising. */ + if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + return true; + } + + const CUDAContextScope scope(this); + + /* Unload existing OptiX module and pipelines first. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + optix_module = NULL; + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + pipelines[i] = NULL; + } + } + + OptixModuleCompileOptions module_options = {}; + module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. 
*/ + + if (DebugFlags().optix.use_debug) { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + module_options.boundValues = nullptr; + module_options.numBoundValues = 0; + + OptixPipelineCompileOptions pipeline_options = {}; + /* Default to no motion blur and two-level graph, since it is the fastest option. */ + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; /* u, v */ + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; + pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ + + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + + /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds + * This is necessary since objects may be reported to have motion if the Vector pass is + * active, but may still need to be rendered without motion blur if that isn't active as well. */ + motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; + + if (motion_blur) { + pipeline_options.usesMotionBlur = true; + /* Motion blur can insert motion transforms into the traversal graph. + * It is no longer a two-level graph then, so need to set flags to allow any configuration. 
*/ + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + { /* Load and compile PTX module with OptiX kernels. */ + string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + if (!getenv("OPTIX_ROOT_DIR")) { + set_error( + "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " + "the Optix SDK to be able to compile Optix kernels on demand)."); + return false; + } + ptx_filename = compile_kernel( + kernel_features, + (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel", + "optix", + true); + } + if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + } + + /* Create program groups. */ + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; /* There are no options currently. 
*/ + group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_closest"; + group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_shadow"; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_subsurface"; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_volume_stack"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & 
KERNEL_FEATURE_HAIR_THICK) { + /* Built-in thick curve intersection. */ + OptixBuiltinISOptions builtin_options = {}; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } + else { + /* Custom ribbon intersection. */ + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } + } + + if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { + /* Add hit group for local intersections. */ + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + + /* Shader raytracing replaces some functions with direct callables. 
*/ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface_raytrace"; + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module; + group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass"; + } + + optix_assert(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + /* Get program stack sizes. */ + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + /* Set up SBT, which in this case is used only to select between different programs. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); /* Upload SBT to device. */ + + /* Calculate maximum trace continuation stack size. */ + unsigned int trace_css = stack_size[PG_HITD].cssCH; + /* This is based on the maximum of closest-hit and any-hit/intersection programs. 
*/ + trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 1; + + if (DebugFlags().optix.use_debug) { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + /* Create shader raytracing pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADE_RAYTRACE])); + + /* Combine ray generation and trace continuation stack size. */ + const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + link_options.maxTraceDepth * trace_css; + const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, + stack_size[PG_CALL_SVM_BEVEL].dssDC); + + /* Set stack size depending on pipeline options. 
*/ + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + } + + { /* Create intersection-only pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_INTERSECT])); + + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; + + optix_assert( + optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + } + + /* Clean up program group objects. */ + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; +} + +/* -------------------------------------------------------------------- + * Buffer denoising. 
+ */ + +class OptiXDevice::DenoiseContext { + public: + explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task) + : denoise_params(task.params), + render_buffers(task.render_buffers), + buffer_params(task.buffer_params), + guiding_buffer(device, "denoiser guiding passes buffer"), + num_samples(task.num_samples) + { + num_input_passes = 1; + if (denoise_params.use_pass_albedo) { + num_input_passes += 1; + use_pass_albedo = true; + pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO); + if (denoise_params.use_pass_normal) { + num_input_passes += 1; + use_pass_normal = true; + pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL); + } + } + + const int num_guiding_passes = num_input_passes - 1; + + if (num_guiding_passes) { + if (task.allow_inplace_modification) { + guiding_params.device_pointer = render_buffers->buffer.device_pointer; + + guiding_params.pass_albedo = pass_denoising_albedo; + guiding_params.pass_normal = pass_denoising_normal; + + guiding_params.stride = buffer_params.stride; + guiding_params.pass_stride = buffer_params.pass_stride; + } + else { + guiding_params.pass_stride = 0; + if (use_pass_albedo) { + guiding_params.pass_albedo = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + if (use_pass_normal) { + guiding_params.pass_normal = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + + guiding_params.stride = buffer_params.width; + + guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height * + guiding_params.pass_stride); + guiding_params.device_pointer = guiding_buffer.device_pointer; + } + } + + pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + } + + const DenoiseParams &denoise_params; + + RenderBuffers *render_buffers = nullptr; + const BufferParams &buffer_params; + + /* Device-side storage of the guiding passes. 
*/ + device_only_memory<float> guiding_buffer; + + struct { + device_ptr device_pointer = 0; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_albedo = PASS_UNUSED; + int pass_normal = PASS_UNUSED; + + int stride = -1; + int pass_stride = -1; + } guiding_params; + + /* Number of input passes. Including the color and extra auxillary passes. */ + int num_input_passes = 0; + bool use_pass_albedo = false; + bool use_pass_normal = false; + + int num_samples = 0; + + int pass_sample_count = PASS_UNUSED; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_denoising_albedo = PASS_UNUSED; + int pass_denoising_normal = PASS_UNUSED; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake = false; +}; + +class OptiXDevice::DenoisePass { + public: + DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type) + { + noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY); + denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + PassType type; + + int noisy_offset; + int denoised_offset; + + int num_components; + bool use_compositing; + bool use_denoising_albedo; +}; + +bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task) +{ + const CUDAContextScope scope(this); + + DenoiseContext context(this, task); + + if (!denoise_ensure(context)) { + return false; + } + + if (!denoise_filter_guiding_preprocess(context)) { + LOG(ERROR) << "Error preprocessing guiding passes."; + 
return false; + } + + /* Passes which will use real albedo when it is available. */ + denoise_pass(context, PASS_COMBINED); + denoise_pass(context, PASS_SHADOW_CATCHER_MATTE); + + /* Passes which do not need albedo and hence if real is present it needs to become fake. */ + denoise_pass(context, PASS_SHADOW_CATCHER); + + return true; +} + +DeviceQueue *OptiXDevice::get_denoise_queue() +{ + return &denoiser_.queue; +} + +bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int *>(&context.guiding_params.pass_normal), + &context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&context.pass_denoising_albedo), + const_cast<int *>(&context.pass_denoising_normal), + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&context.num_samples)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int 
*>(&buffer_params.width), + const_cast<int *>(&buffer_params.height)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args); +} + +void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type) +{ + const BufferParams &buffer_params = context.buffer_params; + + const DenoisePass pass(pass_type, buffer_params); + + if (pass.noisy_offset == PASS_UNUSED) { + return; + } + if (pass.denoised_offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + if (pass.use_denoising_albedo) { + if (context.albedo_replaced_with_fake) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + else if (!context.albedo_replaced_with_fake) { + context.albedo_replaced_with_fake = true; + if (!denoise_filter_guiding_set_fake_albedo(context)) { + LOG(ERROR) << "Error replacing real albedo with the fake one."; + return; + } + } + + /* Read and preprocess noisy color input pass. */ + denoise_color_read(context, pass); + if (!denoise_filter_color_preprocess(context, pass)) { + LOG(ERROR) << "Error connverting denoising passes to RGB buffer."; + return; + } + + if (!denoise_run(context, pass)) { + LOG(ERROR) << "Error running OptiX denoiser."; + return; + } + + /* Store result in the combined pass of the render buffer. + * + * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. 
*/ + if (!denoise_filter_color_postprocess(context, pass)) { + LOG(ERROR) << "Error copying denoiser result to the denoised pass."; + return; + } + + denoiser_.queue.synchronize(); +} + +void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass) +{ + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = pass.type; + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = pass.noisy_offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OptiX does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases. + */ + const PassAccessorGPU pass_accessor( + &denoiser_.queue, pass_access_info, 1.0f, context.num_samples); + + PassAccessor::Destination destination(pass_access_info.type); + destination.d_pixels = context.render_buffers->buffer.device_pointer + + pass.denoised_offset * sizeof(float); + destination.num_components = 3; + destination.pixel_stride = context.buffer_params.pass_stride; + + pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination); +} + +bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int 
*>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&pass.denoised_offset)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context, + const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.num_samples), + const_cast<int *>(&pass.noisy_offset), + const_cast<int *>(&pass.denoised_offset), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&pass.num_components), + const_cast<bool *>(&pass.use_compositing)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_ensure(DenoiseContext &context) +{ + if (!denoise_create_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser creation has failed."; + return false; + } + + if (!denoise_configure_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser configuration has failed."; + return false; + } + + return true; +} + +bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) +{ + const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) || + (denoiser_.use_pass_albedo != context.use_pass_albedo) || + (denoiser_.use_pass_normal != context.use_pass_normal); + if (!recreate_denoiser) { + return true; + } + + /* Destroy existing handle before creating new one. 
*/ + if (denoiser_.optix_denoiser) { + optixDenoiserDestroy(denoiser_.optix_denoiser); + } + + /* Create OptiX denoiser handle on demand when it is first used. */ + OptixDenoiserOptions denoiser_options = {}; + denoiser_options.guideAlbedo = context.use_pass_albedo; + denoiser_options.guideNormal = context.use_pass_normal; + const OptixResult result = optixDenoiserCreate( + this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser); + + if (result != OPTIX_SUCCESS) { + set_error("Failed to create OptiX denoiser"); + return false; + } + + /* OptiX denoiser handle was created with the requested number of input passes. */ + denoiser_.use_pass_albedo = context.use_pass_albedo; + denoiser_.use_pass_normal = context.use_pass_normal; + + /* OptiX denoiser has been created, but it needs configuration. */ + denoiser_.is_configured = false; + + return true; +} + +bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) +{ + if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width && + denoiser_.configured_size.y == context.buffer_params.height)) { + return true; + } + + const BufferParams &buffer_params = context.buffer_params; + + OptixDenoiserSizes sizes = {}; + optix_assert(optixDenoiserComputeMemoryResources( + denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes)); + + denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes; + denoiser_.scratch_offset = sizes.stateSizeInBytes; + + /* Allocate denoiser state if tile size has changed since last setup. */ + denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size); + + /* Initialize denoiser state for the current tile size. 
*/ + const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + buffer_params.width, + buffer_params.height, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + denoiser_.state.device_pointer + + denoiser_.scratch_offset, + denoiser_.scratch_size); + if (result != OPTIX_SUCCESS) { + set_error("Failed to set up OptiX denoiser"); + return false; + } + + denoiser_.is_configured = true; + denoiser_.configured_size.x = buffer_params.width; + denoiser_.configured_size.y = buffer_params.height; + + return true; +} + +bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + const int width = buffer_params.width; + const int height = buffer_params.height; + + /* Set up input and output layer information. */ + OptixImage2D color_layer = {0}; + OptixImage2D albedo_layer = {0}; + OptixImage2D normal_layer = {0}; + + OptixImage2D output_layer = {0}; + + /* Color pass. */ + { + const int pass_denoised = pass.denoised_offset; + const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float); + + color_layer.data = context.render_buffers->buffer.device_pointer + + pass_denoised * sizeof(float); + color_layer.width = width; + color_layer.height = height; + color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride; + color_layer.pixelStrideInBytes = pass_stride_in_bytes; + color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE); + + /* Optional albedo and color passes. 
*/ + if (context.num_input_passes > 1) { + const device_ptr d_guiding_buffer = context.guiding_params.device_pointer; + const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float); + const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes; + + if (context.use_pass_albedo) { + albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float); + albedo_layer.width = width; + albedo_layer.height = height; + albedo_layer.rowStrideInBytes = row_stride_in_bytes; + albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes; + albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + if (context.use_pass_normal) { + normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float); + normal_layer.width = width; + normal_layer.height = height; + normal_layer.rowStrideInBytes = row_stride_in_bytes; + normal_layer.pixelStrideInBytes = pixel_stride_in_bytes; + normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + } + + /* Denoise in-place of the noisy input in the render buffers. */ + output_layer = color_layer; + + /* Finally run denonising. */ + OptixDenoiserParams params = {}; /* All parameters are disabled/zero. 
*/ + OptixDenoiserLayer image_layers = {}; + image_layers.input = color_layer; + image_layers.output = output_layer; + + OptixDenoiserGuideLayer guide_layers = {}; + guide_layers.albedo = albedo_layer; + guide_layers.normal = normal_layer; + + optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + ¶ms, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + &guide_layers, + &image_layers, + 1, + 0, + 0, + denoiser_.state.device_pointer + denoiser_.scratch_offset, + denoiser_.scratch_size)); + + return true; +} + +bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps) +{ + const CUDAContextScope scope(this); + + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + /* Compute memory usage. */ + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options = {}; + options.operation = operation; + if (use_fast_trace_bvh) { + VLOG(2) << "Using fast to trace OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; + } + else { + VLOG(2) << "Using fast to update OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; + } + + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + /* Allocate required output buffers. */ + device_only_memory<char> temp_mem(this, "optix temp as build mem"); + temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); + if (!temp_mem.device_pointer) { + /* Make sure temporary memory allocation succeeded. 
*/ + return false; + } + + device_only_memory<char> &out_data = bvh->as_data; + if (operation == OPTIX_BUILD_OPERATION_BUILD) { + assert(out_data.device == this); + out_data.alloc_to_device(sizes.outputSizeInBytes); + if (!out_data.device_pointer) { + return false; + } + } + else { + assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); + } + + /* Finally build the acceleration structure. */ + OptixAccelEmitDesc compacted_size_prop = {}; + compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; + /* A tiny space was allocated for this property at the end of the temporary buffer above. + * Make sure this pointer is 8-byte aligned. */ + compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); + + OptixTraversableHandle out_handle = 0; + optix_assert(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + use_fast_trace_bvh ? &compacted_size_prop : NULL, + use_fast_trace_bvh ? 1 : 0)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for all operations to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + /* Compact acceleration structure to save memory (do not do this in viewport for faster builds). + */ + if (use_fast_trace_bvh) { + uint64_t compacted_size = sizes.outputSizeInBytes; + cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); + + /* Temporary memory is no longer needed, so free it now to make space. */ + temp_mem.free(); + + /* There is no point compacting if the size does not change. */ + if (compacted_size < sizes.outputSizeInBytes) { + device_only_memory<char> compacted_data(this, "optix compacted as"); + compacted_data.alloc_to_device(compacted_size); + if (!compacted_data.device_pointer) + /* Do not compact if memory allocation for compacted acceleration structure fails. 
+ * Can just use the uncompacted one then, so succeed here regardless. */ + return !have_error(); + + optix_assert(optixAccelCompact( + context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for compaction to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + std::swap(out_data.device_size, compacted_data.device_size); + std::swap(out_data.device_pointer, compacted_data.device_pointer); + } + } + + return !have_error(); +} + +void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + free_bvh_memory_delayed(); + + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + progress.set_substatus("Building OptiX acceleration structure"); + + if (!bvh->params.top_level) { + assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); + + /* Refit is only possible in viewport for now (because AS is built with + * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */ + OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; + if (refit && !use_fast_trace_bvh) { + assert(bvh_optix->traversable_handle != 0); + operation = OPTIX_BUILD_OPERATION_UPDATE; + } + else { + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + } + + /* Build bottom level acceleration structures (BLAS). */ + Geometry *const geom = bvh->geometry[0]; + if (geom->geometry_type == Geometry::HAIR) { + /* Build BLAS for curve primitives. 
*/ + Hair *const hair = static_cast<Hair *const>(geom); + if (hair->num_curves() == 0) { + return; + } + + const size_t num_segments = hair->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + num_motion_steps = hair->get_motion_steps(); + } + + device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + /* Four control points for each curve segment. */ + const size_t num_vertices = num_segments * 4; + if (hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else + aabb_data.alloc(num_segments * num_motion_steps); + + /* Get AABBs for each motion step. */ + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute. */ + const float3 *keys = hair->get_curve_keys().data(); + size_t center_step = (num_motion_steps - 1) / 2; + if (step != center_step) { + size_t attr_offset = (step > center_step) ? step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). 
*/ + keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); + } + + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + const array<float> &curve_radius = hair->get_curve_radius(); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { + if (hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4( + curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); + + /* Convert Catmull-Rom data to Bezier spline. */ + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = 
bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } + } + } + } + + /* Upload AABB data to GPU. */ + aabb_data.copy_to_device(); + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */ + vertex_ptrs.push_back(base_ptr); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + if (hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else { + /* Disable visibility test any-hit program, since it is already checked during + * intersection. 
Those trace calls that require anyhit can force it with a ray flag. */ + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { + /* Build BLAS for triangle primitives. */ + Mesh *const mesh = static_cast<Mesh *const>(geom); + if (mesh->num_triangles() == 0) { + return; + } + + const size_t num_verts = mesh->get_verts().size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + num_motion_steps = mesh->get_motion_steps(); + } + + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + index_data.alloc(mesh->get_triangles().size()); + memcpy(index_data.data(), + mesh->get_triangles().data(), + mesh->get_triangles().size() * sizeof(int)); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->get_verts().data(); + + size_t center_step = (num_motion_steps - 1) / 2; + /* The center step for motion vertices is not stored in the attribute. */ + if (step != center_step) { + verts = motion_keys->data_float3() + (step > center_step ? 
step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + /* Upload triangle data to GPU. */ + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float4); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + /* The SBT does not store per primitive data since Cycles already allocates separate + * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + * one and rely on that having the same meaning in this case. 
*/ + build_input.triangleArray.numSbtRecords = 1; + build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + } + else { + unsigned int num_instances = 0; + unsigned int max_num_instances = 0xFFFFFFFF; + + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + bvh_optix->motion_transform_data.free(); + + optixDeviceContextGetProperty(context, + OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, + &max_num_instances, + sizeof(max_num_instances)); + /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */ + max_num_instances >>= 1; + if (bvh->objects.size() > max_num_instances) { + progress.set_error( + "Failed to build OptiX acceleration structure because there are too many instances"); + return; + } + + /* Fill instance descriptions. */ + device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size()); + + /* Calculate total motion transform size and allocate memory for them. */ + size_t motion_transform_offset = 0; + if (motion_blur) { + size_t total_motion_transform_size = 0; + for (Object *const ob : bvh->objects) { + if (ob->is_traceable() && ob->use_motion()) { + total_motion_transform_size = align_up(total_motion_transform_size, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + const size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + total_motion_transform_size = total_motion_transform_size + + sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + } + } + + assert(bvh_optix->motion_transform_data.device == this); + bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + } + + for (Object *ob : bvh->objects) { + /* Skip non-traceable objects. 
*/ + if (!ob->is_traceable()) { + continue; + } + + BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); + OptixTraversableHandle handle = blas->traversable_handle; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + /* Clear transform to identity matrix. */ + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + /* Set user instance ID to object index (but leave low bit blank). */ + instance.instanceId = ob->get_device_index() << 1; + + /* Have to have at least one bit in the mask, or else instance would always be culled. */ + instance.visibilityMask = 1; + + if (ob->get_geometry()->has_volume) { + /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes. + */ + instance.visibilityMask |= 2; + } + + if (ob->get_geometry()->geometry_type == Geometry::HAIR) { + /* Same applies to curves (so they can be skipped in local trace calls). */ + instance.visibilityMask |= 4; + + if (motion_blur && ob->get_geometry()->has_motion_blur() && + static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + /* Select between motion blur and non-motion blur built-in intersection module. */ + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } + } + + /* Insert motion traversable if object has motion. */ + if (motion_blur && ob->use_motion()) { + size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + + const CUDAContextScope scope(this); + + motion_transform_offset = align_up(motion_transform_offset, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + motion_transform_offset; + motion_transform_offset += motion_transform_size; + + /* Allocate host side memory for motion transform and fill it with transform data. 
*/ + OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( + new uint8_t[motion_transform_size]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->get_motion().size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array<DecomposedTransform> decomp(ob->get_motion().size()); + transform_motion_decompose( + decomp.data(), ob->get_motion().data(), ob->get_motion().size()); + + for (size_t i = 0; i < ob->get_motion().size(); ++i) { + /* Scale. */ + srt_data[i].sx = decomp[i].y.w; /* scale.x.x */ + srt_data[i].sy = decomp[i].z.w; /* scale.y.y */ + srt_data[i].sz = decomp[i].w.w; /* scale.z.z */ + + /* Shear. */ + srt_data[i].a = decomp[i].z.x; /* scale.x.y */ + srt_data[i].b = decomp[i].z.y; /* scale.x.z */ + srt_data[i].c = decomp[i].w.x; /* scale.y.z */ + assert(decomp[i].z.z == 0.0f); /* scale.y.x */ + assert(decomp[i].w.y == 0.0f); /* scale.z.x */ + assert(decomp[i].w.z == 0.0f); /* scale.z.y */ + + /* Pivot point. */ + srt_data[i].pvx = 0.0f; + srt_data[i].pvy = 0.0f; + srt_data[i].pvz = 0.0f; + + /* Rotation. */ + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + + /* Translation. */ + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + /* Upload motion transform to GPU. */ + cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); + delete[] reinterpret_cast<uint8_t *>(&motion_transform); + + /* Disable instance transform if object uses motion transform already. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + /* Get traversable handle to motion transform. 
*/ + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->get_geometry()->is_instanced()) { + /* Set transform matrix. */ + memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); + } + else { + /* Disable instance transform if geometry already has it applied to vertex data. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + /* Non-instanced objects read ID from 'prim_object', so distinguish + * them from instanced objects with the low bit set. */ + instance.instanceId |= 1; + } + } + } + + /* Upload instance descriptions. */ + instances.resize(num_instances); + instances.copy_to_device(); + + /* Build top-level acceleration structure (TLAS) */ + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + + if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + tlas_handle = bvh_optix->traversable_handle; + } +} + +void OptiXDevice::release_optix_bvh(BVH *bvh) +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + /* Do delayed free of BVH memory, since geometry holding BVH might be deleted + * while GPU is still rendering. 
*/ + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); + bvh_optix->traversable_handle = 0; +} + +void OptiXDevice::free_bvh_memory_delayed() +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + delayed_free_bvh_memory.free_memory(); +} + +void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) +{ + /* Set constant memory for CUDA module. */ + CUDADevice::const_copy_to(name, host, size); + + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + /* Update traversable handle (since it is different for each device on multi devices). */ + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(offsetof(KernelParamsOptiX, data), host, size); + return; + } + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(name, #tex_name) == 0) { \ + update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ + return; \ + } + KERNEL_TEX(IntegratorStateGPU, __integrator_state) +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +} + +void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size) +{ + const CUDAContextScope scope(this); + + cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size)); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h new file mode 100644 index 00000000000..742ae0f1bab --- /dev/null +++ b/intern/cycles/device/optix/device_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/device_impl.h" +# include "device/optix/queue.h" +# include "device/optix/util.h" +# include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX; +struct KernelParamsOptiX; + +/* List of OptiX program groups. */ +enum { + PG_RGEN_INTERSECT_CLOSEST, + PG_RGEN_INTERSECT_SHADOW, + PG_RGEN_INTERSECT_SUBSURFACE, + PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_SURFACE_RAYTRACE, + PG_MISS, + PG_HITD, /* Default hit group. */ + PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ + PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */ + PG_HITD_MOTION, + PG_HITS_MOTION, + PG_CALL_SVM_AO, + PG_CALL_SVM_BEVEL, + PG_CALL_AO_PASS, + NUM_PROGRAM_GROUPS +}; + +static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; +static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; +static const int NUM_HIT_PROGRAM_GROUPS = 5; +static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; +static const int NUM_CALLABLE_PROGRAM_GROUPS = 3; + +/* List of OptiX pipelines. */ +enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES }; + +/* A single shader binding table entry. 
*/ +struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; +}; + +class OptiXDevice : public CUDADevice { + public: + OptixDeviceContext context = NULL; + + OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */ + OptixModule builtin_modules[2] = {}; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool motion_blur = false; + device_vector<SbtRecord> sbt_data; + device_only_memory<KernelParamsOptiX> launch_params; + OptixTraversableHandle tlas_handle = 0; + + vector<device_only_memory<char>> delayed_free_bvh_memory; + thread_mutex delayed_free_bvh_mutex; + + class Denoiser { + public: + explicit Denoiser(OptiXDevice *device); + ~Denoiser(); + + OptiXDevice *device; + OptiXDeviceQueue queue; + + OptixDenoiser optix_denoiser = nullptr; + + /* Configuration size, as provided to `optixDenoiserSetup`. + * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the + * `is_configured` will be false. */ + bool is_configured = false; + int2 configured_size = make_int2(0, 0); + + /* OptiX denoiser state and scratch buffers, stored in a single memory buffer. + * The memory layout goes as following: [denoiser state][scratch buffer]. 
*/ + device_only_memory<unsigned char> state; + size_t scratch_offset = 0; + size_t scratch_size = 0; + + bool use_pass_albedo = false; + bool use_pass_normal = false; + }; + Denoiser denoiser_; + + public: + OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + ~OptiXDevice(); + + private: + BVHLayoutMask get_bvh_layout_mask() const override; + + string compile_kernel_get_common_cflags(const uint kernel_features) override; + + bool load_kernels(const uint kernel_features) override; + + bool build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + void release_optix_bvh(BVH *bvh) override; + void free_bvh_memory_delayed(); + + void const_copy_to(const char *name, void *host, size_t size) override; + + void update_launch_params(size_t offset, void *data, size_t data_size); + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* -------------------------------------------------------------------- + * Denoising. + */ + + class DenoiseContext; + class DenoisePass; + + virtual bool denoise_buffer(const DeviceDenoiseTask &task) override; + virtual DeviceQueue *get_denoise_queue() override; + + /* Read guiding passes from the render buffers, preprocess them in a way which is expected by + * OptiX and store in the guiding passes memory within the given context. + * + * Pre=-processing of the guiding passes is to only hapopen once per context lifetime. DO not + * preprocess them for every pass which is being denoised. */ + bool denoise_filter_guiding_preprocess(DenoiseContext &context); + + /* Set fake albedo pixels in the albedo guiding pass storage. + * After this point only passes which do not need albedo for denoising can be processed. 
*/ + bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context); + + void denoise_pass(DenoiseContext &context, PassType pass_type); + + /* Read input color pass from the render buffer into the memory which corresponds to the noisy + * input within the given context. Pixels are scaled to the number of samples, but are not + * preprocessed yet. */ + void denoise_color_read(DenoiseContext &context, const DenoisePass &pass); + + /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the + * denoiser result to the render buffer. */ + bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass); + + /* Make sure the OptiX denoiser is created and configured. */ + bool denoise_ensure(DenoiseContext &context); + + /* Create OptiX denoiser descriptor if needed. + * Will do nothing if the current OptiX descriptor is usable for the given parameters. + * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */ + bool denoise_create_if_needed(DenoiseContext &context); + + /* Configure existing OptiX denoiser descriptor for the use for the given task. */ + bool denoise_configure_if_needed(DenoiseContext &context); + + /* Run configured denoiser. */ + bool denoise_run(DenoiseContext &context, const DenoisePass &pass); +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp new file mode 100644 index 00000000000..458ed70baa8 --- /dev/null +++ b/intern/cycles/device/optix/queue.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/optix/queue.h" +# include "device/optix/device_impl.h" + +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device) +{ +} + +void OptiXDeviceQueue::init_execution() +{ + CUDADeviceQueue::init_execution(); +} + +static bool is_optix_specific_kernel(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +} + +bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (!is_optix_specific_kernel(kernel)) { + return CUDADeviceQueue::enqueue(kernel, work_size, args); + } + + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + + const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; + const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; + + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array), + args[0], // &d_path_index + 
sizeof(device_ptr), + cuda_stream_)); + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), + args[1], // &d_render_buffer + sizeof(device_ptr), + cuda_stream_)); + } + + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + + OptixPipeline pipeline = nullptr; + OptixShaderBindingTable sbt_params = {}; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); + break; + + default: + LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) + << " is attempted to be enqueued."; + return false; + } + + sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); + 
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; + sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord); + sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); + + /* Launch the ray generation program. */ + optix_device_assert(optix_device, + optixLaunch(pipeline, + cuda_stream_, + launch_params_ptr, + optix_device->launch_params.data_elements, + &sbt_params, + work_size, + 1, + 1)); + + return !(optix_device->have_error()); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h new file mode 100644 index 00000000000..0de422ccc71 --- /dev/null +++ b/intern/cycles/device/optix/queue.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/queue.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDevice; + +/* Base class for CUDA queues. 
*/ +class OptiXDeviceQueue : public CUDADeviceQueue { + public: + OptiXDeviceQueue(OptiXDevice *device); + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h new file mode 100644 index 00000000000..34ae5bb5609 --- /dev/null +++ b/intern/cycles/device/optix/util.h @@ -0,0 +1,45 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/util.h" + +# ifdef WITH_CUDA_DYNLOAD +# include <cuew.h> +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif + +# include <optix_stubs.h> + +/* Utility for checking return values of OptiX function calls. 
*/ +# define optix_device_assert(optix_device, stmt) \ + { \ + OptixResult result = stmt; \ + if (result != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(result); \ + optix_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define optix_assert(stmt) optix_device_assert(this, stmt) + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp index 57f25283f85..8294e716ebe 100644 --- a/intern/cycles/graph/node.cpp +++ b/intern/cycles/graph/node.cpp @@ -814,7 +814,7 @@ bool Node::socket_is_modified(const SocketType &input) const return (socket_modified & input.modified_flag_bit) != 0; } -bool Node::is_modified() +bool Node::is_modified() const { return socket_modified != 0; } diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h index aa365baeccd..8f27a82d37b 100644 --- a/intern/cycles/graph/node.h +++ b/intern/cycles/graph/node.h @@ -16,6 +16,8 @@ #pragma once +#include <type_traits> + #include "graph/node_type.h" #include "util/util_array.h" @@ -34,7 +36,10 @@ struct Transform; #define NODE_SOCKET_API_BASE_METHODS(type_, name, string_name) \ const SocketType *get_##name##_socket() const \ { \ - static const SocketType *socket = type->find_input(ustring(string_name)); \ + /* Explicitly cast to base class to use `Node::type` even if the derived class defines \ + * `type`. */ \ + const Node *self_node = this; \ + static const SocketType *socket = self_node->type->find_input(ustring(string_name)); \ return socket; \ } \ bool name##_is_modified() const \ @@ -111,6 +116,15 @@ struct Node { void set(const SocketType &input, const Transform &value); void set(const SocketType &input, Node *value); + /* Implicitly cast enums and enum classes to integer, which matches an internal way of how + * enumerator values are stored and accessed in a generic API. 
*/ + template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr> + void set(const SocketType &input, const ValueType &value) + { + static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int"); + set(input, static_cast<int>(value)); + } + /* set array values. the memory from the input array will taken over * by the node and the input array will be empty after return */ void set(const SocketType &input, array<bool> &value); @@ -164,7 +178,7 @@ struct Node { bool socket_is_modified(const SocketType &input) const; - bool is_modified(); + bool is_modified() const; void tag_modified(); void clear_modified(); diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt new file mode 100644 index 00000000000..bfabd35d7c3 --- /dev/null +++ b/intern/cycles/integrator/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright 2011-2021 Blender Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(INC + .. 
+) + +set(SRC + adaptive_sampling.cpp + denoiser.cpp + denoiser_device.cpp + denoiser_oidn.cpp + denoiser_optix.cpp + path_trace.cpp + tile.cpp + pass_accessor.cpp + pass_accessor_cpu.cpp + pass_accessor_gpu.cpp + path_trace_work.cpp + path_trace_work_cpu.cpp + path_trace_work_gpu.cpp + render_scheduler.cpp + shader_eval.cpp + work_balancer.cpp + work_tile_scheduler.cpp +) + +set(SRC_HEADERS + adaptive_sampling.h + denoiser.h + denoiser_device.h + denoiser_oidn.h + denoiser_optix.h + path_trace.h + tile.h + pass_accessor.h + pass_accessor_cpu.h + pass_accessor_gpu.h + path_trace_work.h + path_trace_work_cpu.h + path_trace_work_gpu.h + render_scheduler.h + shader_eval.h + work_balancer.h + work_tile_scheduler.h +) + +set(LIB + # NOTE: Is required for RenderBuffers access. Might consider moving files around a bit to + # avoid such cyclic dependency. + cycles_render + + cycles_util +) + +if(WITH_OPENIMAGEDENOISE) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ) +endif() + +include_directories(${INC}) +include_directories(SYSTEM ${INC_SYS}) + +cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp new file mode 100644 index 00000000000..23fbcfea5c2 --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/adaptive_sampling.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +AdaptiveSampling::AdaptiveSampling() +{ +} + +int AdaptiveSampling::align_samples(int start_sample, int num_samples) const +{ + if (!use) { + return num_samples; + } + + /* + * The naive implementation goes as following: + * + * int count = 1; + * while (!need_filter(start_sample + count - 1) && count < num_samples) { + * ++count; + * } + * return count; + */ + + /* 0-based sample index at which first filtering will happen. */ + const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1); + + /* Allow as many samples as possible until the first filter sample. */ + if (start_sample + num_samples <= first_filter_sample) { + return num_samples; + } + + const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1)); + + const int num_samples_until_filter = next_filter_sample - start_sample + 1; + + return min(num_samples_until_filter, num_samples); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (!use) { + return false; + } + + if (sample <= min_samples) { + return false; + } + + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h new file mode 100644 index 00000000000..d98edd9894c --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling { + public: + AdaptiveSampling(); + + /* Align number of samples so that they align with the adaptive filtering. + * + * Returns the new value for the `num_samples` so that after rendering so many samples on top + * of `start_sample` filtering is required. + * + * The alignment happens in a way that allows to render as many samples as possible without + * missing any filtering point. This means that the result is "clamped" by the nearest sample + * at which filtering is needed. This is part of mechanism which ensures that all devices will + * perform same exact filtering and adaptive sampling, regardless of their performance. + * + * `start_sample` is the 0-based index of sample. + * + * NOTE: The start sample is included into the number of samples to render. This means that + * if the number of samples is 1, then the path tracer will render samples [align_samples], + * if the number of samples is 2, then the path tracer will render samples [align_samples, + * align_samples + 1] and so on. */ + int align_samples(int start_sample, int num_samples) const; + + /* Check whether adaptive sampling filter should happen at this sample. + * Returns false if the adaptive sampling is not use. + * + * `sample` is the 0-based index of sample. */ + bool need_filter(int sample) const; + + bool use = false; + int adaptive_step = 0; + int min_samples = 0; + float threshold = 0.0f; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp new file mode 100644 index 00000000000..598bbd497a5 --- /dev/null +++ b/intern/cycles/integrator/denoiser.cpp @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser.h" + +#include "device/device.h" +#include "integrator/denoiser_oidn.h" +#include "integrator/denoiser_optix.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams ¶ms) +{ + DCHECK(params.use); + + switch (params.type) { + case DENOISER_OPTIX: + return make_unique<OptiXDenoiser>(path_trace_device, params); + + case DENOISER_OPENIMAGEDENOISE: + return make_unique<OIDNDenoiser>(path_trace_device, params); + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + /* pass */ + break; + } + + LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen."; + + return nullptr; +} + +Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : path_trace_device_(path_trace_device), params_(params) +{ + DCHECK(params.use); +} + +void Denoiser::set_params(const DenoiseParams ¶ms) +{ + DCHECK_EQ(params.type, params_.type); + + if (params.type == params_.type) { + params_ = params; + } + else { + LOG(ERROR) << "Attempt to change denoiser type."; + } +} + +const DenoiseParams &Denoiser::get_params() const +{ + return params_; +} + +bool Denoiser::load_kernels(Progress *progress) +{ + const Device *denoiser_device = ensure_denoiser_device(progress); + + if (!denoiser_device) { + path_trace_device_->set_error("No device available to denoise on"); + return false; + } + + VLOG(3) << "Will denoise on " << 
denoiser_device->info.description << " (" + << denoiser_device->info.id << ")"; + + return true; +} + +Device *Denoiser::get_denoiser_device() const +{ + return denoiser_device_; +} + +/* Check whether given device is single (not a MultiDevice) and supports requested denoiser. */ +static bool is_single_supported_device(Device *device, DenoiserType type) +{ + if (device->info.type == DEVICE_MULTI) { + /* Assume multi-device is never created with a single sub-device. + * If one requests such configuration it should be checked on the session level. */ + return false; + } + + if (!device->info.multi_devices.empty()) { + /* Some configurations will use multi_devices, but keep the type of an individual device. + * This does simplify checks for homogenous setups, but here we really need a single device. */ + return false; + } + + /* Check the denoiser type is supported. */ + return (device->info.denoisers & type); +} + +/* Find best suitable device to perform denoiser on. Will iterate over possible sub-devices of + * multi-device. + * + * If there is no device available which supports given denoiser type nullptr is returned. */ +static Device *find_best_device(Device *device, DenoiserType type) +{ + Device *best_device = nullptr; + + device->foreach_device([&](Device *sub_device) { + if ((sub_device->info.denoisers & type) == 0) { + return; + } + if (!best_device) { + best_device = sub_device; + } + else { + /* TODO(sergey): Choose fastest device from available ones. Taking into account performance + * of the device and data transfer cost. 
*/ + } + }); + + return best_device; +} + +static unique_ptr<Device> create_denoiser_device(Device *path_trace_device, + const uint device_type_mask) +{ + const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask); + if (device_infos.empty()) { + return nullptr; + } + + /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on + * a physical CUDA device which is already used for rendering. */ + + /* TODO(sergey): Choose fastest device for denoising. */ + + const DeviceInfo denoiser_device_info = device_infos.front(); + + unique_ptr<Device> denoiser_device( + Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler)); + + if (!denoiser_device) { + return nullptr; + } + + if (denoiser_device->have_error()) { + return nullptr; + } + + /* Only need denoising feature, everything else is unused. */ + if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) { + return nullptr; + } + + return denoiser_device; +} + +Device *Denoiser::ensure_denoiser_device(Progress *progress) +{ + /* The best device has been found already, avoid sequential lookups. + * Additionally, avoid device re-creation if it has failed once. */ + if (denoiser_device_ || device_creation_attempted_) { + return denoiser_device_; + } + + /* Simple case: rendering happens on a single device which also supports denoiser. */ + if (is_single_supported_device(path_trace_device_, params_.type)) { + denoiser_device_ = path_trace_device_; + return denoiser_device_; + } + + /* Find best device from the ones which are already used for rendering. 
*/ + denoiser_device_ = find_best_device(path_trace_device_, params_.type); + if (denoiser_device_) { + return denoiser_device_; + } + + if (progress) { + progress->set_status("Loading denoising kernels (may take a few minutes the first time)"); + } + + device_creation_attempted_ = true; + + const uint device_type_mask = get_device_type_mask(); + local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask); + denoiser_device_ = local_denoiser_device_.get(); + + return denoiser_device_; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h new file mode 100644 index 00000000000..3101b45e31b --- /dev/null +++ b/intern/cycles/integrator/denoiser.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* TODO(sergey): The integrator folder might not be the best. Is easy to move files around if the + * better place is figured out. */ + +#include "device/device.h" +#include "device/device_denoise.h" +#include "util/util_function.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class RenderBuffers; +class Progress; + +/* Implementation of a specific denoising algorithm. + * + * This class takes care of breaking down denosiing algorithm into a series of device calls or to + * calls of an external API to denoise given input. 
+ * + * TODO(sergey): Are we better with device or a queue here? */ +class Denoiser { + public: + /* Create denoiser for the given path trace device. + * + * Notes: + * - The denoiser must be configured. This means that `params.use` must be true. + * This is checked in debug builds. + * - The device might be MultiDevice. */ + static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params); + + virtual ~Denoiser() = default; + + void set_params(const DenoiseParams &params); + const DenoiseParams &get_params() const; + + /* Create devices and load kernels needed for denoising. + * The progress is used to communicate state when kernels actually need to be loaded. + * + * NOTE: The `progress` is an optional argument, can be nullptr. */ + virtual bool load_kernels(Progress *progress); + + /* Denoise the entire buffer. + * + * Buffer parameters denote the effective parameters used during rendering. It could be + * a lower resolution render into a bigger allocated buffer, which is used in viewport during + * navigation and non-unit pixel size. Use that instead of render_buffers->params. + * + * The buffer might be coming from a "foreign" device from what this denoiser is created for. + * This means that in the general case the denoiser will make sure the input data is available on + * the denoiser device, perform denoising, and put data back to the device where the buffer + * came from. + * + * The `num_samples` corresponds to the number of samples in the render buffers. It is used + * to scale buffers down to the "final" value in algorithms which don't do automatic exposure, + * or which need the "final" value for data passes. + * + * The `allow_inplace_modification` means that the denoiser is allowed to do in-place + * modification of the input passes (e.g. scaling them down). This will lower the memory + * footprint of the denoiser but will make input passes "invalid" from the path tracer's point of + * view. + * + * Returns true when all passes are denoised.
Will return false if there is a denoiser error (for + * example, caused by misconfigured denoiser) or when the user requested to cancel rendering. */ + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) = 0; + + /* Get a device which is used to perform actual denoising. + * + * Notes: + * + * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then, + * + * - The device can be different from the path tracing device. This happens, for example, when + * using OptiX denoiser and rendering on CPU. + * + * - No threading safety is ensured in this call. This means that it is up to the caller to ensure + * that there is no threading conflict between the denoising task lazily initializing the device and + * access to this device happens. */ + Device *get_denoiser_device() const; + + function<bool(void)> is_cancelled_cb; + + bool is_cancelled() const + { + if (!is_cancelled_cb) { + return false; + } + return is_cancelled_cb(); + } + + protected: + Denoiser(Device *path_trace_device, const DenoiseParams &params); + + /* Make sure the denoising device is initialized. */ + virtual Device *ensure_denoiser_device(Progress *progress); + + /* Get device type mask which is used to filter available devices when a new device needs to be + * created. */ + virtual uint get_device_type_mask() const = 0; + + Device *path_trace_device_; + DenoiseParams params_; + + /* Cached pointer to the device on which denoising will happen. + * Used to avoid lookup of a device for every denoising request. */ + Device *denoiser_device_ = nullptr; + + /* Denoiser device which was created to perform denoising in the case none of the rendering + * devices are capable of denoising.
*/ + unique_ptr<Device> local_denoiser_device_; + bool device_creation_attempted_ = false; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp new file mode 100644 index 00000000000..8088cfd7800 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.cpp @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser_device.h" + +#include "device/device.h" +#include "device/device_denoise.h" +#include "device/device_memory.h" +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ +} + +DeviceDenoiser::~DeviceDenoiser() +{ + /* Explicit implementation, to allow forward declaration of Device in the header. 
*/ +} + +bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + Device *denoiser_device = get_denoiser_device(); + if (!denoiser_device) { + return false; + } + + DeviceDenoiseTask task; + task.params = params_; + task.num_samples = num_samples; + task.buffer_params = buffer_params; + task.allow_inplace_modification = allow_inplace_modification; + + RenderBuffers local_render_buffers(denoiser_device); + bool local_buffer_used = false; + + if (denoiser_device == render_buffers->buffer.device) { + /* The device can access an existing buffer pointer. */ + local_buffer_used = false; + task.render_buffers = render_buffers; + } + else { + VLOG(3) << "Creating temporary buffer on denoiser device."; + + DeviceQueue *queue = denoiser_device->get_denoise_queue(); + + /* Create a buffer which is accessible by the device used by the denoiser. */ + + /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes, + * ignoring other light and data passes. */ + + local_buffer_used = true; + + render_buffers->copy_from_device(); + + local_render_buffers.reset(buffer_params); + + /* NOTE: The local buffer is allocated for an exact size of the effective render size, while + * the input render buffer is allocated for the lowest resolution divider possible. So it is + * important to only copy the actually needed part of the input buffer.
*/ + memcpy(local_render_buffers.buffer.data(), + render_buffers->buffer.data(), + sizeof(float) * local_render_buffers.buffer.size()); + + queue->copy_to_device(local_render_buffers.buffer); + + task.render_buffers = &local_render_buffers; + task.allow_inplace_modification = true; + } + + const bool denoise_result = denoiser_device->denoise_buffer(task); + + if (local_buffer_used) { + local_render_buffers.copy_from_device(); + + render_buffers_host_copy_denoised( + render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params); + + render_buffers->copy_to_device(); + } + + return denoise_result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h new file mode 100644 index 00000000000..0fd934dba79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Denoiser which uses device-specific denoising implementation, such as OptiX denoiser which are + * implemented as a part of a driver of specific device. + * + * This implementation makes sure the to-be-denoised buffer is available on the denoising device + * and invoke denoising kernel via device API. 
*/ +class DeviceDenoiser : public Denoiser { + public: + DeviceDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + ~DeviceDenoiser(); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp new file mode 100644 index 00000000000..1b5a012ec87 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.cpp @@ -0,0 +1,628 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_oidn.h" + +#include <array> + +#include "device/device.h" +#include "device/device_queue.h" +#include "integrator/pass_accessor_cpu.h" +#include "render/buffers.h" +#include "util/util_array.h" +#include "util/util_logging.h" +#include "util/util_openimagedenoise.h" + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +thread_mutex OIDNDenoiser::mutex_; + +OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ + DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE); + + DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform."; +} + +#ifdef WITH_OPENIMAGEDENOISE +static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/) +{ + OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr); + return !oidn_denoiser->is_cancelled(); +} +#endif + +#ifdef WITH_OPENIMAGEDENOISE + +class OIDNPass { + public: + OIDNPass() = default; + + OIDNPass(const BufferParams &buffer_params, + const char *name, + PassType type, + PassMode mode = PassMode::NOISY) + : name(name), type(type), mode(mode) + { + offset = buffer_params.get_pass_offset(type, mode); + need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + inline operator bool() const + { + return name[0] != '\0'; + } + + /* Name of an image which will be passed to the OIDN library. + * Should be one of the following: color, albedo, normal, output. + * The albedo and normal images are optional. 
*/ + const char *name = ""; + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + int num_components = -1; + bool use_compositing = false; + bool use_denoising_albedo = true; + + /* Offset of beginning of this pass in the render buffers. */ + int offset = -1; + + /* Denotes whether the data is to be scaled down with the number of passes. + * Is required for albedo and normal passes. The color pass OIDN will perform auto-exposure, so + * scaling is not needed for the color pass unless adaptive sampling is used. + * + * NOTE: Do not scale the outout pass, as that requires to be a pointer in the original buffer. + * All the scaling on the output needed for integration with adaptive sampling will happen + * outside of generic pass handling. */ + bool need_scale = false; + + /* The content of the pass has been pre-filtered. */ + bool is_filtered = false; + + /* For the scaled passes, the data which holds values of scaled pixels. */ + array<float> scaled_buffer; +}; + +class OIDNDenoiseContext { + public: + OIDNDenoiseContext(OIDNDenoiser *denoiser, + const DenoiseParams &denoise_params, + const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + const bool allow_inplace_modification) + : denoiser_(denoiser), + denoise_params_(denoise_params), + buffer_params_(buffer_params), + render_buffers_(render_buffers), + num_samples_(num_samples), + allow_inplace_modification_(allow_inplace_modification), + pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT)) + { + if (denoise_params_.use_pass_albedo) { + oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO); + } + + if (denoise_params_.use_pass_normal) { + oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL); + } + } + + bool need_denoising() const + { + if (buffer_params_.width == 0 && buffer_params_.height == 0) { + return false; + } + + return true; + } + + /* Make the guiding passes available by a sequential 
denoising of various passes. */ + void read_guiding_passes() + { + read_guiding_pass(oidn_albedo_pass_); + read_guiding_pass(oidn_normal_pass_); + } + + void denoise_pass(const PassType pass_type) + { + OIDNPass oidn_color_pass(buffer_params_, "color", pass_type); + if (oidn_color_pass.offset == PASS_UNUSED) { + return; + } + + if (oidn_color_pass.use_denoising_albedo) { + if (albedo_replaced_with_fake_) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + + OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED); + if (oidn_output_pass.offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass); + + oidn::DeviceRef oidn_device = oidn::newDevice(); + oidn_device.commit(); + + /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too. + */ + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_input_pass(oidn_filter, oidn_color_access_pass); + set_guiding_passes(oidn_filter, oidn_color_pass); + set_output_pass(oidn_filter, oidn_output_pass); + oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE || + denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) { + oidn_filter.set("cleanAux", true); + } + oidn_filter.commit(); + + filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_); + filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_); + + /* Filter the beauty image. */ + oidn_filter.execute(); + + /* Check for errors. 
*/ + const char *error_message; + const oidn::Error error = oidn_device.getError(error_message); + if (error != oidn::Error::None && error != oidn::Error::Cancelled) { + LOG(ERROR) << "OpenImageDenoise error: " << error_message; + } + + postprocess_output(oidn_color_pass, oidn_output_pass); + } + + protected: + void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass) + { + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass || + oidn_pass.is_filtered) { + return; + } + + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_pass(oidn_filter, oidn_pass); + set_output_pass(oidn_filter, oidn_pass); + oidn_filter.commit(); + oidn_filter.execute(); + + oidn_pass.is_filtered = true; + } + + /* Make pixels of a guiding pass available by the denoiser. */ + void read_guiding_pass(OIDNPass &oidn_pass) + { + if (!oidn_pass) { + return; + } + + DCHECK(!oidn_pass.use_compositing); + + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE && + !is_pass_scale_needed(oidn_pass)) { + /* Pass data is available as-is from the render buffers. */ + return; + } + + if (allow_inplace_modification_) { + scale_pass_in_render_buffers(oidn_pass); + return; + } + + read_pass_pixels_into_buffer(oidn_pass); + } + + /* Special reader of the input pass. + * To save memory it will read pixels into the output, and let the denoiser to perform an + * in-place operation. */ + OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + const bool use_compositing = oidn_input_pass.use_compositing; + + /* Simple case: no compositing is involved, no scaling is needed. + * The pass pixels will be referenced as-is, without extra processing. 
*/ + if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) { + return oidn_input_pass; + } + + float *buffer_data = render_buffers_->buffer.data(); + float *pass_data = buffer_data + oidn_output_pass.offset; + + PassAccessor::Destination destination(pass_data, 3); + destination.pixel_stride = buffer_params_.pass_stride; + + read_pass_pixels(oidn_input_pass, destination); + + OIDNPass oidn_input_pass_at_output = oidn_input_pass; + oidn_input_pass_at_output.offset = oidn_output_pass.offset; + + return oidn_input_pass_at_output; + } + + /* Read pass pixels using PassAccessor into the given destination. */ + void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination) + { + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = oidn_pass.type; + pass_access_info.mode = oidn_pass.mode; + pass_access_info.offset = oidn_pass.offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OIDN does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* OIDN will perform an auto-exposure, so it is not required to know the exact exposure configured + * by users. What is important is to use the same exposure for read and write access of the pass + * pixels. */ + const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_); + + pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination); + } + + /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass.
*/ + void read_pass_pixels_into_buffer(OIDNPass &oidn_pass) + { + VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " (" + << pass_type_as_string(oidn_pass.type) << ")"; + + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + array<float> &scaled_buffer = oidn_pass.scaled_buffer; + scaled_buffer.resize(width * height * 3); + + const PassAccessor::Destination destination(scaled_buffer.data(), 3); + + read_pass_pixels(oidn_pass, destination); + } + + /* Set OIDN image to reference pixels from the given render buffer pass. + * No transform to the pixels is done, no additional memory is used. */ + void set_pass_referenced(oidn::FilterRef &oidn_filter, + const char *name, + const OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + + const int64_t pixel_index = offset + x + y * stride; + const int64_t buffer_offset = pixel_index * pass_stride; + + float *buffer_data = render_buffers_->buffer.data(); + + oidn_filter.setImage(name, + buffer_data + buffer_offset + oidn_pass.offset, + oidn::Format::Float3, + width, + height, + 0, + pass_stride * sizeof(float), + stride * pass_stride * sizeof(float)); + } + + void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + oidn_filter.setImage( + name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0); + } + + void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, oidn_pass.name, oidn_pass); + } + void set_pass(oidn::FilterRef &oidn_filter, const char *name, 
OIDNPass &oidn_pass) + { + if (oidn_pass.scaled_buffer.empty()) { + set_pass_referenced(oidn_filter, name, oidn_pass); + } + else { + set_pass_from_buffer(oidn_filter, name, oidn_pass); + } + } + + void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass); + } + + void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + if (oidn_albedo_pass_) { + if (oidn_pass.use_denoising_albedo) { + set_pass(oidn_filter, oidn_albedo_pass_); + } + else { + /* NOTE: OpenImageDenoise library implicitly expects albedo pass when normal pass has been + * provided. */ + set_fake_albedo_pass(oidn_filter); + } + } + + if (oidn_normal_pass_) { + set_pass(oidn_filter, oidn_normal_pass_); + } + } + + void set_fake_albedo_pass(oidn::FilterRef &oidn_filter) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + if (!albedo_replaced_with_fake_) { + const int64_t num_pixel_components = width * height * 3; + oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components); + + for (int i = 0; i < num_pixel_components; ++i) { + oidn_albedo_pass_.scaled_buffer[i] = 0.5f; + } + + albedo_replaced_with_fake_ = true; + } + + set_pass(oidn_filter, oidn_albedo_pass_); + } + + void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, "output", oidn_pass); + } + + /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel + * back. 
*/ + void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components); + + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing; + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *denoised_pixel = buffer_pixel + oidn_output_pass.offset; + + if (need_scale) { + const float pixel_scale = has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_; + + denoised_pixel[0] = denoised_pixel[0] * pixel_scale; + denoised_pixel[1] = denoised_pixel[1] * pixel_scale; + denoised_pixel[2] = denoised_pixel[2] * pixel_scale; + } + + if (oidn_output_pass.num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!oidn_input_pass.use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it + * simplifies logic and avoids extra memory allocation. 
*/ + const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning to zero since this is a default alpha value for 3-component passes, and it + * is an opaque pixel for 4 component passes. */ + denoised_pixel[3] = 0; + } + } + } + } + + bool is_pass_scale_needed(OIDNPass &oidn_pass) const + { + if (pass_sample_count_ != PASS_UNUSED) { + /* With adaptive sampling pixels will have different number of samples in them, so need to + * always scale the pass to make pixels uniformly sampled. */ + return true; + } + + if (!oidn_pass.need_scale) { + return false; + } + + if (num_samples_ == 1) { + /* If the avoid scaling if there is only one sample, to save up time (so we dont divide + * buffer by 1). */ + return false; + } + + return true; + } + + void scale_pass_in_render_buffers(OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *pass_pixel = buffer_pixel + oidn_pass.offset; + + const float pixel_scale = 1.0f / (has_pass_sample_count ? 
+ __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_); + + pass_pixel[0] = pass_pixel[0] * pixel_scale; + pass_pixel[1] = pass_pixel[1] * pixel_scale; + pass_pixel[2] = pass_pixel[2] * pixel_scale; + } + } + } + + OIDNDenoiser *denoiser_ = nullptr; + + const DenoiseParams &denoise_params_; + const BufferParams &buffer_params_; + RenderBuffers *render_buffers_ = nullptr; + int num_samples_ = 0; + bool allow_inplace_modification_ = false; + int pass_sample_count_ = PASS_UNUSED; + + /* Optional albedo and normal passes, reused by denoising of different pass types. */ + OIDNPass oidn_albedo_pass_; + OIDNPass oidn_normal_pass_; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake_ = false; +}; +#endif + +static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers) +{ + Device *device = render_buffers->buffer.device; + if (device->info.has_gpu_queue) { + return device->gpu_queue_create(); + } + return nullptr; +} + +static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_from_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_from_device(); + } +} + +static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_to_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_to_device(); + } +} + +bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + thread_scoped_lock lock(mutex_); + + /* Make sure the host-side data is available 
for denoising. */ + unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers); + copy_render_buffers_from_device(queue, render_buffers); + +#ifdef WITH_OPENIMAGEDENOISE + OIDNDenoiseContext context( + this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification); + + if (context.need_denoising()) { + context.read_guiding_passes(); + + const std::array<PassType, 3> passes = { + {/* Passes which will use real albedo when it is available. */ + PASS_COMBINED, + PASS_SHADOW_CATCHER_MATTE, + + /* Passes which do not need albedo and hence if real is present it needs to become fake. + */ + PASS_SHADOW_CATCHER}}; + + for (const PassType pass_type : passes) { + context.denoise_pass(pass_type); + if (is_cancelled()) { + return false; + } + } + + /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code + * copies data from the device it doesn't overwrite the denoiser buffers. */ + copy_render_buffers_to_device(queue, render_buffers); + } +#endif + + /* This code is not supposed to run when compiled without OIDN support, so can assume if we made + * it up here all passes are properly denoised. */ + return true; +} + +uint OIDNDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_CPU; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h new file mode 100644 index 00000000000..566e761ae79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Implementation of denoising API which uses OpenImageDenoise library. */ +class OIDNDenoiser : public Denoiser { + public: + /* Forwardly declared state which might be using compile-flag specific fields, such as + * OpenImageDenoise device and filter handles. */ + class State; + + OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; + + protected: + virtual uint get_device_type_mask() const override; + + /* We only perform one denoising at a time, since OpenImageDenoise itself is multithreaded. + * Use this mutex whenever images are passed to the OIDN and needs to be denoised. */ + static thread_mutex mutex_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/integrator/denoiser_optix.cpp index ed64ae01aae..5f9de23bfe6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/integrator/denoiser_optix.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,21 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_direct_lighting.h" +#include "integrator/denoiser_optix.h" -#define KERNEL_NAME direct_lighting -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "device/device.h" +#include "device/device_denoise.h" +CCL_NAMESPACE_BEGIN + +OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : DeviceDenoiser(path_trace_device, params) +{ +} + +uint OptiXDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_OPTIX; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/integrator/denoiser_optix.h index c314dc96c33..a8df770ecf7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/integrator/denoiser_optix.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_lamp_emission.h" +#pragma once -#define KERNEL_NAME lamp_emission -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "integrator/denoiser_device.h" +CCL_NAMESPACE_BEGIN + +class OptiXDenoiser : public DeviceDenoiser { + public: + OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + protected: + virtual uint get_device_type_mask() const override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp new file mode 100644 index 00000000000..87c048b1fa5 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.cpp @@ -0,0 +1,318 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/kernel_types.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Pass input information. 
+ */ + +PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass) + : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset) +{ +} + +/* -------------------------------------------------------------------- + * Pass destination. + */ + +PassAccessor::Destination::Destination(float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels) + : Destination(pass_type) +{ + pixels_half_rgba = pixels; +} + +PassAccessor::Destination::Destination(const PassType pass_type) +{ + const PassInfo pass_info = Pass::get_info(pass_type); + num_components = pass_info.num_components; +} + +/* -------------------------------------------------------------------- + * Pass source. + */ + +PassAccessor::Source::Source(const float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +/* -------------------------------------------------------------------- + * Pass accessor. + */ + +PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples) + : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples) +{ +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + return get_render_tile_pixels(render_buffers, render_buffers->params, destination); +} + +static void pad_pixels(const BufferParams &buffer_params, + const PassAccessor::Destination &destination, + const int src_num_components) +{ + /* When requesting a single channel pass as RGBA, or RGB pass as RGBA, + * fill in the additional components for convenience. 
*/ + const int dest_num_components = destination.num_components; + + if (src_num_components >= dest_num_components) { + return; + } + + const size_t size = buffer_params.width * buffer_params.height; + if (destination.pixels) { + float *pixel = destination.pixels; + + for (size_t i = 0; i < size; i++, pixel += dest_num_components) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[1] = pixel[0]; + pixel[2] = pixel[0]; + } + if (dest_num_components >= 4) { + pixel[3] = 1.0f; + } + } + } + + if (destination.pixels_half_rgba) { + const half one = float_to_half(1.0f); + half4 *pixel = destination.pixels_half_rgba; + + for (size_t i = 0; i < size; i++, pixel++) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[0].y = pixel[0].x; + pixel[0].z = pixel[0].x; + } + if (dest_num_components >= 4) { + pixel[0].w = one; + } + } + } +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + if (pass_access_info_.offset == PASS_UNUSED) { + return false; + } + + const PassType type = pass_access_info_.type; + const PassMode mode = pass_access_info_.mode; + const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo); + + if (pass_info.num_components == 1) { + /* Single channel passes. */ + if (mode == PassMode::DENOISED) { + /* Denoised passes store their final pixels, no need in special calculation. */ + get_pass_float(render_buffers, buffer_params, destination); + } + else if (type == PASS_RENDER_TIME) { + /* TODO(sergey): Needs implementation. 
*/ + } + else if (type == PASS_DEPTH) { + get_pass_depth(render_buffers, buffer_params, destination); + } + else if (type == PASS_MIST) { + get_pass_mist(render_buffers, buffer_params, destination); + } + else if (type == PASS_SAMPLE_COUNT) { + get_pass_sample_count(render_buffers, buffer_params, destination); + } + else { + get_pass_float(render_buffers, buffer_params, destination); + } + } + else if (type == PASS_MOTION) { + /* Motion pass. */ + DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components"; + get_pass_motion(render_buffers, buffer_params, destination); + } + else if (type == PASS_CRYPTOMATTE) { + /* Cryptomatte pass. */ + DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components"; + get_pass_cryptomatte(render_buffers, buffer_params, destination); + } + else { + /* RGB, RGBA and vector passes. */ + DCHECK(destination.num_components == 3 || destination.num_components == 4) + << pass_type_as_string(type) << " pass must have 3 or 4 components"; + + if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) { + /* Denoised matte with shadow needs to do calculation (will use denoised shadow catcher pass + * to approximate shadow with). */ + get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination); + } + else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) { + /* Shadow catcher pass. */ + get_pass_shadow_catcher(render_buffers, buffer_params, destination); + } + else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE || + pass_info.indirect_type != PASS_NONE) && + mode != PassMode::DENOISED) { + /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */ + get_pass_light_path(render_buffers, buffer_params, destination); + } + else { + /* Passes that need no special computation, or denoised passes that already + * had the computation done. 
*/ + if (pass_info.num_components == 3) { + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (pass_info.num_components == 4) { + if (destination.num_components == 3) { + /* Special case for denoiser access of RGBA passes ignoring alpha channel. */ + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER || + type == PASS_SHADOW_CATCHER_MATTE) { + /* Passes with transparency as 4th component. */ + get_pass_combined(render_buffers, buffer_params, destination); + } + else { + /* Passes with alpha as 4th component. */ + get_pass_float4(render_buffers, buffer_params, destination); + } + } + } + } + + pad_pixels(buffer_params, destination, pass_info.num_components); + + return true; +} + +void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const +{ + const PassMode mode = pass_access_info_.mode; + const PassInfo &pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + kfilm_convert->pass_offset = pass_access_info_.offset; + kfilm_convert->pass_stride = buffer_params.pass_stride; + + kfilm_convert->pass_use_exposure = pass_info.use_exposure; + kfilm_convert->pass_use_filter = pass_info.use_filter; + + /* TODO(sergey): Some of the passes needs to become denoised when denoised pass is accessed. 
*/ + if (pass_info.direct_type != PASS_NONE) { + kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type); + } + kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type); + kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type); + + kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED); + kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset( + PASS_ADAPTIVE_AUX_BUFFER); + kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT); + kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode); + kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_SAMPLE_COUNT); + kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_MATTE, mode); + + /* Background is not denoised, so always use noisy pass. */ + kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND); + + if (pass_info.use_filter) { + kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f; + } + else { + kfilm_convert->scale = 1.0f; + } + + if (pass_info.use_exposure) { + kfilm_convert->exposure = exposure_; + } + else { + kfilm_convert->exposure = 1.0f; + } + + kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure; + + kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher; + kfilm_convert->use_approximate_shadow_catcher_background = + pass_access_info_.use_approximate_shadow_catcher_background; + kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels; + + kfilm_convert->num_components = destination.num_components; + kfilm_convert->pixel_stride = destination.pixel_stride ? 
destination.pixel_stride : + destination.num_components; + + kfilm_convert->is_denoised = (mode == PassMode::DENOISED); +} + +bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source) +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + const PassInfo pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + const BufferParams &buffer_params = render_buffers->params; + + float *buffer_data = render_buffers->buffer.data(); + const int size = buffer_params.width * buffer_params.height; + + const int out_stride = buffer_params.pass_stride; + const int in_stride = source.num_components; + const int num_components_to_copy = min(source.num_components, pass_info.num_components); + + float *out = buffer_data + pass_access_info_.offset; + const float *in = source.pixels + source.offset * in_stride; + + for (int i = 0; i < size; i++, out += out_stride, in += in_stride) { + memcpy(out, in, sizeof(float) * num_components_to_copy); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h new file mode 100644 index 00000000000..624bf7d0b2c --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.h @@ -0,0 +1,160 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "render/pass.h" +#include "util/util_half.h" +#include "util/util_string.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class RenderBuffers; +class BufferPass; +class BufferParams; +struct KernelFilmConvert; + +/* Helper class which allows to access pass data. + * Is designed in a way that it is created once when the pass data is known, and then pixels gets + * progressively update from various render buffers. */ +class PassAccessor { + public: + class PassAccessInfo { + public: + PassAccessInfo() = default; + explicit PassAccessInfo(const BufferPass &pass); + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + bool include_albedo = false; + int offset = -1; + + /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its + * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop. + */ + bool use_approximate_shadow_catcher = false; + + /* When approximate shadow catcher matte is used alpha-over the result on top of background. */ + bool use_approximate_shadow_catcher_background = false; + + bool show_active_pixels = false; + }; + + class Destination { + public: + Destination() = default; + Destination(float *pixels, int num_components); + Destination(const PassType pass_type, half4 *pixels); + + /* Destination will be initialized with the number of components which is native for the given + * pass type. */ + explicit Destination(const PassType pass_type); + + /* CPU-side pointers. only usable by the `PassAccessorCPU`. */ + float *pixels = nullptr; + half4 *pixels_half_rgba = nullptr; + + /* Device-side pointers. */ + device_ptr d_pixels = 0; + device_ptr d_pixels_half_rgba = 0; + + /* Number of components per pixel in the floating-point destination. + * Is ignored for half4 destination (where number of components is implied to be 4). */ + int num_components = 0; + + /* Offset in pixels from the beginning of pixels storage. 
+ * Allows to get pixels of render buffer into a partial slice of the destination. */ + int offset = 0; + + /* Number of floats per pixel. When zero is the same as `num_components`. + * + * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component + * half-floats. */ + int pixel_stride = 0; + + /* Row stride in pixel elements: + * - For the float destination stride is a number of floats per row. + * - For the half4 destination stride is a number of half4 per row. */ + int stride = 0; + }; + + class Source { + public: + Source() = default; + Source(const float *pixels, int num_components); + + /* CPU-side pointers. only usable by the `PassAccessorCPU`. */ + const float *pixels = nullptr; + int num_components = 0; + + /* Offset in pixels from the beginning of pixels storage. + * Allows to get pixels of render buffer into a partial slice of the destination. */ + int offset = 0; + }; + + PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples); + + virtual ~PassAccessor() = default; + + /* Get pass data from the given render buffers, perform needed filtering, and store result into + * the pixels. + * The result is stored sequentially starting from the very beginning of the pixels memory. */ + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const; + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + /* Set pass data for the given render buffers. Used for baking to read from passes. 
*/ + bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source); + + protected: + virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const = 0; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR + + PassAccessInfo pass_access_info_; + + float exposure_ = 0.0f; + int num_samples_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp new file mode 100644 index 00000000000..3c6691f6d43 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -0,0 +1,183 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_film.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Kernel processing. + */ + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + if (destination.pixels) { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. */ + + run_get_pass_kernel_processor_float( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + + if (destination.pixels_half_rgba) { + /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. 
*/ + + if (destination.num_components == 1) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float pixel; + processor(kfilm_convert, buffer, &pixel); + + pixel_rgba[0] = pixel; + pixel_rgba[1] = pixel; + pixel_rgba[2] = pixel; + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 3) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 4) { + run_get_pass_kernel_processor_half_rgba( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + } +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + const float *buffer_data = render_buffers->buffer.data(); + const int pixel_stride = destination.pixel_stride ? 
destination.pixel_stride : + destination.num_components; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride; + + processor(kfilm_convert, buffer, pixel); + } + }); +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + const float *buffer_data = render_buffers->buffer.data(); + + half4 *dst_start = destination.pixels_half_rgba + destination.offset; + const int destination_stride = destination.stride != 0 ? destination.stride : + buffer_params.width; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + half4 *dst_row_start = dst_start + y * destination_stride; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + half4 *pixel_half_rgba = dst_row_start + x; + float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + } + }); +} + +/* -------------------------------------------------------------------- + * Pass accessors. 
+ */ + +#define DEFINE_PASS_ACCESSOR(pass) \ + void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_get_pass_kernel_processor( \ + render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth) +DEFINE_PASS_ACCESSOR(mist) +DEFINE_PASS_ACCESSOR(sample_count) +DEFINE_PASS_ACCESSOR(float) + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path) +DEFINE_PASS_ACCESSOR(shadow_catcher) +DEFINE_PASS_ACCESSOR(float3) + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion) +DEFINE_PASS_ACCESSOR(cryptomatte) +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) +DEFINE_PASS_ACCESSOR(combined) +DEFINE_PASS_ACCESSOR(float4) + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h new file mode 100644 index 00000000000..0313dc5bb0d --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" + +CCL_NAMESPACE_BEGIN + +struct KernelFilmConvert; + +/* Pass accessor implementation for CPU side. 
*/ +class PassAccessorCPU : public PassAccessor { + public: + using PassAccessor::PassAccessor; + + protected: + template<typename Processor> + inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. 
*/ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp new file mode 100644 index 00000000000..eb80ba99655 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.cpp @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_gpu.h" + +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples) + : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue) + +{ +} + +/* -------------------------------------------------------------------- + * Kernel execution. 
+ */ + +void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + const int work_size = buffer_params.width * buffer_params.height; + + const int destination_stride = destination.stride != 0 ? destination.stride : + buffer_params.width; + + if (destination.d_pixels) { + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel, work_size, args); + } + if (destination.d_pixels_half_rgba) { + const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1); + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels_half_rgba), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel_half_float, work_size, args); + } + + queue_->synchronize(); +} + +/* -------------------------------------------------------------------- + * Pass accessors. 
+ */ + +#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \ + void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_film_convert_kernels( \ + DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth, DEPTH); +DEFINE_PASS_ACCESSOR(mist, MIST); +DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT); +DEFINE_PASS_ACCESSOR(float, FLOAT); + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH); +DEFINE_PASS_ACCESSOR(float3, FLOAT3); + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion, MOTION); +DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE); +DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER); +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW); +DEFINE_PASS_ACCESSOR(combined, COMBINED); +DEFINE_PASS_ACCESSOR(float4, FLOAT4); + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h new file mode 100644 index 00000000000..bc37e4387f3 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +/* Pass accessor implementation for GPU side. */ +class PassAccessorGPU : public PassAccessor { + public: + PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples); + + protected: + void run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth); + DECLARE_PASS_ACCESSOR(mist); + DECLARE_PASS_ACCESSOR(sample_count); + DECLARE_PASS_ACCESSOR(float); + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path); + DECLARE_PASS_ACCESSOR(float3); + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion); + DECLARE_PASS_ACCESSOR(cryptomatte); + DECLARE_PASS_ACCESSOR(shadow_catcher); + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow); + DECLARE_PASS_ACCESSOR(combined); + DECLARE_PASS_ACCESSOR(float4); + +#undef DECLARE_PASS_ACCESSOR + + DeviceQueue *queue_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp new file mode 100644 index 00000000000..6c02316ac2b --- /dev/null +++ b/intern/cycles/integrator/path_trace.cpp @@ -0,0 +1,1147 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace.h" + +#include "device/cpu/device.h" +#include "device/device.h" +#include "integrator/pass_accessor.h" +#include "integrator/render_scheduler.h" +#include "render/gpu_display.h" +#include "render/pass.h" +#include "render/scene.h" +#include "render/tile.h" +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +PathTrace::PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager) + : device_(device), + device_scene_(device_scene), + render_scheduler_(render_scheduler), + tile_manager_(tile_manager) +{ + DCHECK_NE(device_, nullptr); + + { + vector<DeviceInfo> cpu_devices; + device_cpu_info(cpu_devices); + + cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler)); + } + + /* Create path tracing work in advance, so that it can be reused by incremental sampling as much + * as possible. */ + device_->foreach_device([&](Device *path_trace_device) { + path_trace_works_.emplace_back(PathTraceWork::create( + path_trace_device, film, device_scene, &render_cancel_.is_requested)); + }); + + work_balance_infos_.resize(path_trace_works_.size()); + work_balance_do_initial(work_balance_infos_); + + render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1); +} + +PathTrace::~PathTrace() +{ + /* Destroy any GPU resource which was used for graphics interop. 
+ * Need to have access to the GPUDisplay as it is the only source of drawing context which is + * used for interop. */ + if (gpu_display_) { + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->destroy_gpu_resources(gpu_display_.get()); + } + } +} + +void PathTrace::load_kernels() +{ + if (denoiser_) { + denoiser_->load_kernels(progress_); + } +} + +void PathTrace::alloc_work_memory() +{ + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->alloc_work_memory(); + } +} + +bool PathTrace::ready_to_reset() +{ + /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU + * display. Of there is no such display, the logic here will break. */ + DCHECK(gpu_display_); + + /* The logic here tries to provide behavior which feels the most interactive feel to artists. + * General idea is to be able to reset as quickly as possible, while still providing interactive + * feel. + * + * If the render result was ever drawn after previous reset, consider that reset is now possible. + * This way camera navigation gives the quickest feedback of rendered pixels, regardless of + * whether CPU or GPU drawing pipeline is used. + * + * Consider reset happening after redraw "slow" enough to not clog anything. This is a bit + * arbitrary, but seems to work very well with viewport navigation in Blender. */ + + if (did_draw_after_reset_) { + return true; + } + + return false; +} + +void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params) +{ + if (big_tile_params_.modified(big_tile_params)) { + big_tile_params_ = big_tile_params; + render_state_.need_reset_params = true; + } + + full_params_ = full_params; + + /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation. + * It is requires to inform about reset whenever it happens, so that the redraw state tracking is + * properly updated. 
*/ + if (gpu_display_) { + gpu_display_->reset(full_params); + } + + render_state_.has_denoised_result = false; + render_state_.tile_written = false; + + did_draw_after_reset_ = false; +} + +void PathTrace::device_free() +{ + /* Free render buffers used by the path trace work to reduce memory peak. */ + BufferParams empty_params; + empty_params.pass_stride = 0; + empty_params.update_offset_stride(); + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->get_render_buffers()->reset(empty_params); + } + render_state_.need_reset_params = true; +} + +void PathTrace::set_progress(Progress *progress) +{ + progress_ = progress; +} + +void PathTrace::render(const RenderWork &render_work) +{ + /* Indicate that rendering has started and that it can be requested to cancel. */ + { + thread_scoped_lock lock(render_cancel_.mutex); + if (render_cancel_.is_requested) { + return; + } + render_cancel_.is_rendering = true; + } + + render_pipeline(render_work); + + /* Indicate that rendering has finished, making it so thread which requested `cancel()` can carry + * on. */ + { + thread_scoped_lock lock(render_cancel_.mutex); + render_cancel_.is_rendering = false; + render_cancel_.condition.notify_one(); + } +} + +void PathTrace::render_pipeline(RenderWork render_work) +{ + /* NOTE: Only check for "instant" cancel here. Ther user-requested cancel via progress is + * checked in Session and the work in the event of cancel is to be finished here. 
*/ + + render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes != + 0); + + render_init_kernel_execution(); + + render_scheduler_.report_work_begin(render_work); + + init_render_buffers(render_work); + + rebalance(render_work); + + path_trace(render_work); + if (render_cancel_.is_requested) { + return; + } + + adaptive_sample(render_work); + if (render_cancel_.is_requested) { + return; + } + + cryptomatte_postprocess(render_work); + if (render_cancel_.is_requested) { + return; + } + + denoise(render_work); + if (render_cancel_.is_requested) { + return; + } + + write_tile_buffer(render_work); + update_display(render_work); + + progress_update_if_needed(render_work); + + finalize_full_buffer_on_disk(render_work); +} + +void PathTrace::render_init_kernel_execution() +{ + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->init_execution(); + } +} + +/* TODO(sergey): Look into `std::function` rather than using a template. Should not be a + * measurable performance impact at runtime, but will make compilation faster and binary somewhat + * smaller. */ +template<typename Callback> +static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works, + const vector<WorkBalanceInfo> &work_balance_infos, + const BufferParams &buffer_params, + const Callback &callback) +{ + const int num_works = path_trace_works.size(); + const int height = buffer_params.height; + + int current_y = 0; + for (int i = 0; i < num_works; ++i) { + const double weight = work_balance_infos[i].weight; + const int slice_height = max(lround(height * weight), 1); + + /* Disallow negative values to deal with situations when there are more compute devices than + * scanlines. 
*/ + const int remaining_height = max(0, height - current_y); + + BufferParams slide_params = buffer_params; + slide_params.full_y = buffer_params.full_y + current_y; + if (i < num_works - 1) { + slide_params.height = min(slice_height, remaining_height); + } + else { + slide_params.height = remaining_height; + } + + slide_params.update_offset_stride(); + + callback(path_trace_works[i].get(), slide_params); + + current_y += slide_params.height; + } +} + +void PathTrace::update_allocated_work_buffer_params() +{ + foreach_sliced_buffer_params(path_trace_works_, + work_balance_infos_, + big_tile_params_, + [](PathTraceWork *path_trace_work, const BufferParams ¶ms) { + RenderBuffers *buffers = path_trace_work->get_render_buffers(); + buffers->reset(params); + }); +} + +static BufferParams scale_buffer_params(const BufferParams ¶ms, int resolution_divider) +{ + BufferParams scaled_params = params; + + scaled_params.width = max(1, params.width / resolution_divider); + scaled_params.height = max(1, params.height / resolution_divider); + scaled_params.full_x = params.full_x / resolution_divider; + scaled_params.full_y = params.full_y / resolution_divider; + scaled_params.full_width = params.full_width / resolution_divider; + scaled_params.full_height = params.full_height / resolution_divider; + + scaled_params.update_offset_stride(); + + return scaled_params; +} + +void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work) +{ + const int resolution_divider = render_work.resolution_divider; + + const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider); + const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_, + resolution_divider); + + foreach_sliced_buffer_params(path_trace_works_, + work_balance_infos_, + scaled_big_tile_params, + [&](PathTraceWork *path_trace_work, const BufferParams params) { + path_trace_work->set_effective_buffer_params( + scaled_full_params, scaled_big_tile_params, 
params); + }); + + render_state_.effective_big_tile_params = scaled_big_tile_params; +} + +void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work) +{ + if (render_state_.need_reset_params) { + update_allocated_work_buffer_params(); + } + + if (render_state_.need_reset_params || + render_state_.resolution_divider != render_work.resolution_divider) { + update_effective_work_buffer_params(render_work); + } + + render_state_.resolution_divider = render_work.resolution_divider; + render_state_.need_reset_params = false; +} + +void PathTrace::init_render_buffers(const RenderWork &render_work) +{ + update_work_buffer_params_if_needed(render_work); + + /* Handle initialization scheduled by the render scheduler. */ + if (render_work.init_render_buffers) { + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->zero_render_buffers(); + }); + + tile_buffer_read(); + } +} + +void PathTrace::path_trace(RenderWork &render_work) +{ + if (!render_work.path_trace.num_samples) { + return; + } + + VLOG(3) << "Will path trace " << render_work.path_trace.num_samples + << " samples at the resolution divider " << render_work.resolution_divider; + + const double start_time = time_dt(); + + const int num_works = path_trace_works_.size(); + + tbb::parallel_for(0, num_works, [&](int i) { + const double work_start_time = time_dt(); + const int num_samples = render_work.path_trace.num_samples; + + PathTraceWork *path_trace_work = path_trace_works_[i].get(); + + PathTraceWork::RenderStatistics statistics; + path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples); + + const double work_time = time_dt() - work_start_time; + work_balance_infos_[i].time_spent += work_time; + work_balance_infos_[i].occupancy = statistics.occupancy; + + VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds (" + << work_time / num_samples + << " seconds per sample), 
occupancy: " << statistics.occupancy; + }); + + float occupancy_accum = 0.0f; + for (const WorkBalanceInfo &balance_info : work_balance_infos_) { + occupancy_accum += balance_info.occupancy; + } + const float occupancy = occupancy_accum / num_works; + render_scheduler_.report_path_trace_occupancy(render_work, occupancy); + + render_scheduler_.report_path_trace_time( + render_work, time_dt() - start_time, is_cancel_requested()); +} + +void PathTrace::adaptive_sample(RenderWork &render_work) +{ + if (!render_work.adaptive_sampling.filter) { + return; + } + + bool did_reschedule_on_idle = false; + + while (true) { + VLOG(3) << "Will filter adaptive stopping buffer, threshold " + << render_work.adaptive_sampling.threshold; + if (render_work.adaptive_sampling.reset) { + VLOG(3) << "Will re-calculate convergency flag for currently converged pixels."; + } + + const double start_time = time_dt(); + + uint num_active_pixels = 0; + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + const uint num_active_pixels_in_work = + path_trace_work->adaptive_sampling_converge_filter_count_active( + render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset); + if (num_active_pixels_in_work) { + atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work); + } + }); + + render_scheduler_.report_adaptive_filter_time( + render_work, time_dt() - start_time, is_cancel_requested()); + + if (num_active_pixels == 0) { + VLOG(3) << "All pixels converged."; + if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) { + break; + } + VLOG(3) << "Continuing with lower threshold."; + } + else if (did_reschedule_on_idle) { + break; + } + else if (num_active_pixels < 128 * 128) { + /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep GPU busy so that + * there is no performance loss from the progressive noise floor feature. 
+ * + * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of + * the final resolution. */ + if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) { + VLOG(3) << "Rescheduling is not possible: final threshold is reached."; + break; + } + VLOG(3) << "Rescheduling lower threshold."; + did_reschedule_on_idle = true; + } + else { + break; + } + } +} + +void PathTrace::set_denoiser_params(const DenoiseParams ¶ms) +{ + render_scheduler_.set_denoiser_params(params); + + if (!params.use) { + denoiser_.reset(); + return; + } + + if (denoiser_) { + const DenoiseParams old_denoiser_params = denoiser_->get_params(); + if (old_denoiser_params.type == params.type) { + denoiser_->set_params(params); + return; + } + } + + denoiser_ = Denoiser::create(device_, params); + denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); }; +} + +void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling) +{ + render_scheduler_.set_adaptive_sampling(adaptive_sampling); +} + +void PathTrace::cryptomatte_postprocess(const RenderWork &render_work) +{ + if (!render_work.cryptomatte.postprocess) { + return; + } + VLOG(3) << "Perform cryptomatte work."; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->cryptomatte_postproces(); + }); +} + +void PathTrace::denoise(const RenderWork &render_work) +{ + if (!render_work.tile.denoise) { + return; + } + + if (!denoiser_) { + /* Denoiser was not configured, so nothing to do here. 
*/ + return; + } + + VLOG(3) << "Perform denoising work."; + + const double start_time = time_dt(); + + RenderBuffers *buffer_to_denoise = nullptr; + + unique_ptr<RenderBuffers> multi_device_buffers; + bool allow_inplace_modification = false; + + if (path_trace_works_.size() == 1) { + buffer_to_denoise = path_trace_works_.front()->get_render_buffers(); + } + else { + Device *denoiser_device = denoiser_->get_denoiser_device(); + if (!denoiser_device) { + return; + } + + multi_device_buffers = make_unique<RenderBuffers>(denoiser_device); + multi_device_buffers->reset(render_state_.effective_big_tile_params); + + buffer_to_denoise = multi_device_buffers.get(); + + copy_to_render_buffers(multi_device_buffers.get()); + + allow_inplace_modification = true; + } + + if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params, + buffer_to_denoise, + get_num_samples_in_buffer(), + allow_inplace_modification)) { + render_state_.has_denoised_result = true; + } + + if (multi_device_buffers) { + multi_device_buffers->copy_from_device(); + tbb::parallel_for_each( + path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get()); + }); + } + + render_scheduler_.report_denoise_time(render_work, time_dt() - start_time); +} + +void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) +{ + gpu_display_ = move(gpu_display); +} + +void PathTrace::clear_gpu_display() +{ + if (gpu_display_) { + gpu_display_->clear(); + } +} + +void PathTrace::draw() +{ + if (!gpu_display_) { + return; + } + + did_draw_after_reset_ |= gpu_display_->draw(); +} + +void PathTrace::update_display(const RenderWork &render_work) +{ + if (!render_work.display.update) { + return; + } + + if (!gpu_display_ && !tile_buffer_update_cb) { + VLOG(3) << "Ignore display update."; + return; + } + + if (full_params_.width == 0 || full_params_.height == 0) { + VLOG(3) << "Skipping GPUDisplay update 
due to 0 size of the render buffer."; + return; + } + + const double start_time = time_dt(); + + if (tile_buffer_update_cb) { + VLOG(3) << "Invoke buffer update callback."; + + tile_buffer_update_cb(); + } + + if (gpu_display_) { + VLOG(3) << "Perform copy to GPUDisplay work."; + + const int resolution_divider = render_work.resolution_divider; + const int texture_width = max(1, full_params_.width / resolution_divider); + const int texture_height = max(1, full_params_.height / resolution_divider); + if (!gpu_display_->update_begin(texture_width, texture_height)) { + LOG(ERROR) << "Error beginning GPUDisplay update."; + return; + } + + const PassMode pass_mode = render_work.display.use_denoised_result && + render_state_.has_denoised_result ? + PassMode::DENOISED : + PassMode::NOISY; + + /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from + * all works in parallel. */ + const int num_samples = get_num_samples_in_buffer(); + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples); + } + + gpu_display_->update_end(); + } + + render_scheduler_.report_display_update_time(render_work, time_dt() - start_time); +} + +void PathTrace::rebalance(const RenderWork &render_work) +{ + static const int kLogLevel = 3; + + if (!render_work.rebalance) { + return; + } + + const int num_works = path_trace_works_.size(); + + if (num_works == 1) { + VLOG(kLogLevel) << "Ignoring rebalance work due to single device render."; + return; + } + + const double start_time = time_dt(); + + if (VLOG_IS_ON(kLogLevel)) { + VLOG(kLogLevel) << "Perform rebalance work."; + VLOG(kLogLevel) << "Per-device path tracing time (seconds):"; + for (int i = 0; i < num_works; ++i) { + VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": " + << work_balance_infos_[i].time_spent; + } + } + + const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_); + + if 
(VLOG_IS_ON(kLogLevel)) { + VLOG(kLogLevel) << "Calculated per-device weights for works:"; + for (int i = 0; i < num_works; ++i) { + VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": " + << work_balance_infos_[i].weight; + } + } + + if (!did_rebalance) { + VLOG(kLogLevel) << "Balance in path trace works did not change."; + render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false); + return; + } + + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + + copy_to_render_buffers(&big_tile_cpu_buffers); + + render_state_.need_reset_params = true; + update_work_buffer_params_if_needed(render_work); + + copy_from_render_buffers(&big_tile_cpu_buffers); + + render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true); +} + +void PathTrace::write_tile_buffer(const RenderWork &render_work) +{ + if (!render_work.tile.write) { + return; + } + + VLOG(3) << "Write tile result."; + + render_state_.tile_written = true; + + const bool has_multiple_tiles = tile_manager_.has_multiple_tiles(); + + /* Write render tile result, but only if not using tiled rendering. + * + * Tiles are written to a file during rendering, and written to the software at the end + * of rendering (wither when all tiles are finished, or when rendering was requested to be + * cancelled). + * + * Important thing is: tile should be written to the software via callback only once. */ + if (!has_multiple_tiles) { + VLOG(3) << "Write tile result via buffer write callback."; + tile_buffer_write(); + } + + /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile. 
+ */ + if (has_multiple_tiles) { + VLOG(3) << "Write tile result into ."; + tile_buffer_write_to_disk(); + } +} + +void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work) +{ + if (!render_work.full.write) { + return; + } + + VLOG(3) << "Handle full-frame render buffer work."; + + if (!tile_manager_.has_written_tiles()) { + VLOG(3) << "No tiles on disk."; + return; + } + + /* Make sure writing to the file is fully finished. + * This will include writing all possible missing tiles, ensuring validness of the file. */ + tile_manager_.finish_write_tiles(); + + /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after + * all scenes and layers are rendered by the Session (which happens after freeing Session memory, + * so that we never hold scene and full-frame buffer in memory at the same time). */ +} + +void PathTrace::cancel() +{ + thread_scoped_lock lock(render_cancel_.mutex); + + render_cancel_.is_requested = true; + + while (render_cancel_.is_rendering) { + render_cancel_.condition.wait(lock); + } + + render_cancel_.is_requested = false; +} + +int PathTrace::get_num_samples_in_buffer() +{ + return render_scheduler_.get_num_rendered_samples(); +} + +bool PathTrace::is_cancel_requested() +{ + if (render_cancel_.is_requested) { + return true; + } + + if (progress_ != nullptr) { + if (progress_->get_cancel()) { + return true; + } + } + + return false; +} + +void PathTrace::tile_buffer_write() +{ + if (!tile_buffer_write_cb) { + return; + } + + tile_buffer_write_cb(); +} + +void PathTrace::tile_buffer_read() +{ + if (!tile_buffer_read_cb) { + return; + } + + if (tile_buffer_read_cb()) { + tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_render_buffers_to_device(); + }); + } +} + +void PathTrace::tile_buffer_write_to_disk() +{ + /* Sample count pass is required to support per-tile partial results stored in the file. 
*/ + DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED); + + const int num_rendered_samples = render_scheduler_.get_num_rendered_samples(); + + if (num_rendered_samples == 0) { + /* The tile has zero samples, no need to write it. */ + return; + } + + /* Get access to the CPU-side render buffers of the current big tile. */ + RenderBuffers *buffers; + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + + if (path_trace_works_.size() == 1) { + path_trace_works_[0]->copy_render_buffers_from_device(); + buffers = path_trace_works_[0]->get_render_buffers(); + } + else { + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + copy_to_render_buffers(&big_tile_cpu_buffers); + + buffers = &big_tile_cpu_buffers; + } + + if (!tile_manager_.write_tile(*buffers)) { + LOG(ERROR) << "Error writing tile to file."; + } +} + +void PathTrace::progress_update_if_needed(const RenderWork &render_work) +{ + if (progress_ != nullptr) { + const int2 tile_size = get_render_tile_size(); + const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples; + const int current_sample = render_work.path_trace.start_sample + + render_work.path_trace.num_samples; + progress_->add_samples(num_samples_added, current_sample); + } + + if (progress_update_cb) { + progress_update_cb(); + } +} + +void PathTrace::progress_set_status(const string &status, const string &substatus) +{ + if (progress_ != nullptr) { + progress_->set_status(status, substatus); + } +} + +void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_to_render_buffers(render_buffers); + }); + render_buffers->copy_to_device(); +} + +void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers) +{ + render_buffers->copy_from_device(); + tbb::parallel_for_each(path_trace_works_, + 
[&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_render_buffers(render_buffers); + }); +} + +bool PathTrace::copy_render_tile_from_device() +{ + if (full_frame_state_.render_buffers) { + /* Full-frame buffer is always allocated on CPU. */ + return true; + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->copy_render_buffers_from_device()) { + success = false; + } + }); + + return success; +} + +static string get_layer_view_name(const RenderBuffers &buffers) +{ + string result; + + if (buffers.params.layer.size()) { + result += string(buffers.params.layer); + } + + if (buffers.params.view.size()) { + if (!result.empty()) { + result += ", "; + } + result += string(buffers.params.view); + } + + return result; +} + +void PathTrace::process_full_buffer_from_disk(string_view filename) +{ + VLOG(3) << "Processing full frame buffer file " << filename; + + progress_set_status("Reading full buffer from disk"); + + RenderBuffers full_frame_buffers(cpu_device_.get()); + + DenoiseParams denoise_params; + if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) { + LOG(ERROR) << "Error reading tiles from file."; + return; + } + + const string layer_view_name = get_layer_view_name(full_frame_buffers); + + render_state_.has_denoised_result = false; + + if (denoise_params.use) { + progress_set_status(layer_view_name, "Denoising"); + + /* Re-use the denoiser as much as possible, avoiding possible device re-initialization. + * + * It will not conflict with the regular rendering as: + * - Rendering is supposed to be finished here. + * - The next rendering will go via Session's `run_update_for_next_iteration` which will + * ensure proper denoiser is used. 
*/ + set_denoiser_params(denoise_params); + + /* Number of samples doesn't matter too much, since the sampels count pass will be used. */ + denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false); + + render_state_.has_denoised_result = true; + } + + full_frame_state_.render_buffers = &full_frame_buffers; + + progress_set_status(layer_view_name, "Finishing"); + + /* Write the full result pretending that there is a single tile. + * Requires some state change, but allows to use same communication API with the software. */ + tile_buffer_write(); + + full_frame_state_.render_buffers = nullptr; +} + +int PathTrace::get_num_render_tile_samples() const +{ + if (full_frame_state_.render_buffers) { + /* If the full-frame buffer is read from disk the number of samples is not used as there is a + * sample count pass for that in the buffer. Just avoid access to badly defined state of the + * path state. */ + return 0; + } + + return render_scheduler_.get_num_rendered_samples(); +} + +bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination) +{ + if (full_frame_state_.render_buffers) { + return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination); + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) { + success = false; + } + }); + + return success; +} + +bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor, + const PassAccessor::Source &source) +{ + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) { + success = false; + } + }); + + return success; +} + +int2 PathTrace::get_render_tile_size() const 
+{ + if (full_frame_state_.render_buffers) { + return make_int2(full_frame_state_.render_buffers->params.width, + full_frame_state_.render_buffers->params.height); + } + + const Tile &tile = tile_manager_.get_current_tile(); + return make_int2(tile.width, tile.height); +} + +int2 PathTrace::get_render_tile_offset() const +{ + if (full_frame_state_.render_buffers) { + return make_int2(0, 0); + } + + const Tile &tile = tile_manager_.get_current_tile(); + return make_int2(tile.x, tile.y); +} + +const BufferParams &PathTrace::get_render_tile_params() const +{ + if (full_frame_state_.render_buffers) { + return full_frame_state_.render_buffers->params; + } + + return big_tile_params_; +} + +bool PathTrace::has_denoised_result() const +{ + return render_state_.has_denoised_result; +} + +/* -------------------------------------------------------------------- + * Report generation. + */ + +static const char *device_type_for_description(const DeviceType type) +{ + switch (type) { + case DEVICE_NONE: + return "None"; + + case DEVICE_CPU: + return "CPU"; + case DEVICE_CUDA: + return "CUDA"; + case DEVICE_OPTIX: + return "OptiX"; + case DEVICE_DUMMY: + return "Dummy"; + case DEVICE_MULTI: + return "Multi"; + } + + return "UNKNOWN"; +} + +/* Construct description of the device which will appear in the full report. */ +/* TODO(sergey): Consider making it more reusable utility. 
*/ +static string full_device_info_description(const DeviceInfo &device_info) +{ + string full_description = device_info.description; + + full_description += " (" + string(device_type_for_description(device_info.type)) + ")"; + + if (device_info.display_device) { + full_description += " (display)"; + } + + if (device_info.type == DEVICE_CPU) { + full_description += " (" + to_string(device_info.cpu_threads) + " threads)"; + } + + full_description += " [" + device_info.id + "]"; + + return full_description; +} + +/* Construct string which will contain information about devices, possibly multiple of the devices. + * + * In the simple case the result looks like: + * + * Message: Full Device Description + * + * If there are multiple devices then the result looks like: + * + * Message: Full First Device Description + * Full Second Device Description + * + * Note that the newlines are placed in a way so that the result can be easily concatenated to the + * full report. */ +static string device_info_list_report(const string &message, const DeviceInfo &device_info) +{ + string result = "\n" + message + ": "; + const string pad(message.length() + 2, ' '); + + if (device_info.multi_devices.empty()) { + result += full_device_info_description(device_info) + "\n"; + return result; + } + + bool is_first = true; + for (const DeviceInfo &sub_device_info : device_info.multi_devices) { + if (!is_first) { + result += pad; + } + + result += full_device_info_description(sub_device_info) + "\n"; + + is_first = false; + } + + return result; +} + +static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works) +{ + DeviceInfo device_info; + device_info.type = DEVICE_MULTI; + + for (auto &&path_trace_work : path_trace_works) { + device_info.multi_devices.push_back(path_trace_work->get_device()->info); + } + + return device_info_list_report("Path tracing on", device_info); +} + +static string denoiser_device_report(const Denoiser *denoiser) +{ + if 
(!denoiser) { + return ""; + } + + if (!denoiser->get_params().use) { + return ""; + } + + const Device *denoiser_device = denoiser->get_denoiser_device(); + if (!denoiser_device) { + return ""; + } + + return device_info_list_report("Denoising on", denoiser_device->info); +} + +string PathTrace::full_report() const +{ + string result = "\nFull path tracing report\n"; + + result += path_trace_devices_report(path_trace_works_); + result += denoiser_device_report(denoiser_.get()); + + /* Report from the render scheduler, which includes: + * - Render mode (interactive, offline, headless) + * - Adaptive sampling and denoiser parameters + * - Breakdown of timing. */ + result += render_scheduler_.full_report(); + + return result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h new file mode 100644 index 00000000000..78ca68c1198 --- /dev/null +++ b/intern/cycles/integrator/path_trace.h @@ -0,0 +1,324 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/denoiser.h" +#include "integrator/pass_accessor.h" +#include "integrator/path_trace_work.h" +#include "integrator/work_balancer.h" +#include "render/buffers.h" +#include "util/util_function.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling; +class Device; +class DeviceScene; +class Film; +class RenderBuffers; +class RenderScheduler; +class RenderWork; +class Progress; +class GPUDisplay; +class TileManager; + +/* PathTrace class takes care of kernel graph and scheduling on a (multi)device. It takes care of + * all the common steps of path tracing which are not device-specific. The list of tasks includes + * but is not limited to: + * - Kernel graph. + * - Scheduling logic. + * - Queues management. + * - Adaptive stopping. */ +class PathTrace { + public: + /* Render scheduler is used to report timing information and access things like start/finish + * sample. */ + PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager); + ~PathTrace(); + + /* Create devices and load kernels which are created on-demand (for example, denoising devices). + * The progress is reported to the currently configure progress object (via `set_progress`). */ + void load_kernels(); + + /* Allocate working memory. This runs before allocating scene memory so that we can estimate + * more accurately which scene device memory may need to allocated on the host. */ + void alloc_work_memory(); + + /* Check whether now it is a good time to reset rendering. + * Used to avoid very often resets in the viewport, giving it a chance to draw intermediate + * render result. */ + bool ready_to_reset(); + + void reset(const BufferParams &full_params, const BufferParams &big_tile_params); + + void device_free(); + + /* Set progress tracker. 
+ * Used to communicate details about the progress to the outer world, check whether rendering is + * to be canceled. + * + * The path tracer writes to this object, and then at a convenient moment runs + * progress_update_cb() callback. */ + void set_progress(Progress *progress); + + /* NOTE: This is a blocking call. Meaning, it will not return until given number of samples are + * rendered (or until rendering is requested to be cancelled). */ + void render(const RenderWork &render_work); + + /* TODO(sergey): Decide whether denoiser is really a part of path tracer. Currently it is + * convenient to have it here because then its easy to access render buffer. But the downside is + * that this adds too much of entities which can live separately with some clear API. */ + + /* Set denoiser parameters. + * Use this to configure the denoiser before rendering any samples. */ + void set_denoiser_params(const DenoiseParams ¶ms); + + /* Set parameters used for adaptive sampling. + * Use this to configure the adaptive sampler before rendering any samples. */ + void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling); + + /* Set GPU display which takes care of drawing the render result. */ + void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); + + /* Clear the GPU display by filling it in with all zeroes. */ + void clear_gpu_display(); + + /* Perform drawing of the current state of the GPUDisplay. */ + void draw(); + + /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled. + * Used in cases like reset of render session. + * + * This is a blockign call, which returns as soon as there is no running `render_samples()` call. + */ + void cancel(); + + /* Copy an entire render buffer to/from the path trace. */ + + /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and + * the data will be copied to the device of the given render buffers. 
*/ + void copy_to_render_buffers(RenderBuffers *render_buffers); + + /* Copy happens via CPU side buffer: data will be copied from the device of the given rendetr + * buffers and will be copied to all devices of the path trace. */ + void copy_from_render_buffers(RenderBuffers *render_buffers); + + /* Copy render buffers of the big tile from the device to hsot. + * Return true if all copies are successful. */ + bool copy_render_tile_from_device(); + + /* Read given full-frame file from disk, perform needed processing and write it to the software + * via the write callback. */ + void process_full_buffer_from_disk(string_view filename); + + /* Get number of samples in the current big tile render buffers. */ + int get_num_render_tile_samples() const; + + /* Get pass data of the entire big tile. + * This call puts pass render result from all devices into the final pixels storage. + * + * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`. + * + * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */ + bool get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination); + + /* Set pass data for baking. */ + bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source); + + /* Check whether denoiser was run and denoised passes are available. */ + bool has_denoised_result() const; + + /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. + * In the case of tiled rendering this will return full-frame after all tiles has been rendered. + * + * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame + * instead. */ + int2 get_render_tile_size() const; + int2 get_render_tile_offset() const; + + /* Get buffer parameters of the current tile. + * + * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame + * instead. 
*/ + const BufferParams &get_render_tile_params() const; + + /* Generate full multi-line report of the rendering process, including rendering parameters, + * times, and so on. */ + string full_report() const; + + /* Callback which communicates an updates state of the render buffer of the current big tile. + * Is called during path tracing to communicate work-in-progress state of the final buffer. */ + function<void(void)> tile_buffer_update_cb; + + /* Callback which communicates final rendered buffer. Is called after pathtracing is done. */ + function<void(void)> tile_buffer_write_cb; + + /* Callback which initializes rendered buffer. Is called before pathtracing starts. + * + * This is used for baking. */ + function<bool(void)> tile_buffer_read_cb; + + /* Callback which is called to report current rendering progress. + * + * It is supposed to be cheaper than buffer update/write, hence can be called more often. + * Additionally, it might be called form the middle of wavefront (meaning, it is not guaranteed + * that the buffer is "uniformly" sampled at the moment of this callback). */ + function<void(void)> progress_update_cb; + + protected: + /* Actual implementation of the rendering pipeline. + * Calls steps in order, checking for the cancel to be requested inbetween. + * + * Is separate from `render()` to simplify dealing with the early outputs and keeping + * `render_cancel_` in the consistent state. */ + void render_pipeline(RenderWork render_work); + + /* Initialize kernel execution on all integrator queues. */ + void render_init_kernel_execution(); + + /* Make sure both allocated and effective buffer parameters of path tracer works are up to date + * with the current big tile parameters, performance-dependent slicing, and resolution divider. 
+ */ + void update_work_buffer_params_if_needed(const RenderWork &render_work); + void update_allocated_work_buffer_params(); + void update_effective_work_buffer_params(const RenderWork &render_work); + + /* Perform various steps of the render work. + * + * Note that some steps might modify the work, forcing some steps to happen within this iteration + * of rendering. */ + void init_render_buffers(const RenderWork &render_work); + void path_trace(RenderWork &render_work); + void adaptive_sample(RenderWork &render_work); + void denoise(const RenderWork &render_work); + void cryptomatte_postprocess(const RenderWork &render_work); + void update_display(const RenderWork &render_work); + void rebalance(const RenderWork &render_work); + void write_tile_buffer(const RenderWork &render_work); + void finalize_full_buffer_on_disk(const RenderWork &render_work); + + /* Get number of samples in the current state of the render buffers. */ + int get_num_samples_in_buffer(); + + /* Check whether user requested to cancel rendering, so that path tracing is to be finished as + * soon as possible. */ + bool is_cancel_requested(); + + /* Write the big tile render buffer via the write callback. */ + void tile_buffer_write(); + + /* Read the big tile render buffer via the read callback. */ + void tile_buffer_read(); + + /* Write current tile into the file on disk. */ + void tile_buffer_write_to_disk(); + + /* Run the progress_update_cb callback if it is needed. */ + void progress_update_if_needed(const RenderWork &render_work); + + void progress_set_status(const string &status, const string &substatus = ""); + + /* Pointer to a device which is configured to be used for path tracing. If multiple devices + * are configured this is a `MultiDevice`. */ + Device *device_ = nullptr; + + /* CPU device for creating temporary render buffers on the CPU side. 
*/ + unique_ptr<Device> cpu_device_; + + DeviceScene *device_scene_; + + RenderScheduler &render_scheduler_; + TileManager &tile_manager_; + + unique_ptr<GPUDisplay> gpu_display_; + + /* Per-compute device descriptors of work which is responsible for path tracing on its configured + * device. */ + vector<unique_ptr<PathTraceWork>> path_trace_works_; + + /* Per-path trace work information needed for multi-device balancing. */ + vector<WorkBalanceInfo> work_balance_infos_; + + /* Render buffer parameters of the full frame and current big tile. */ + BufferParams full_params_; + BufferParams big_tile_params_; + + /* Denoiser which takes care of denoising the big tile. */ + unique_ptr<Denoiser> denoiser_; + + /* State which is common for all the steps of the render work. + * Is brought up to date in the `render()` call and is accessed from all the steps involved into + * rendering the work. */ + struct { + /* Denotes whether render buffers parameters of path trace works are to be reset for the new + * value of the big tile parameters. */ + bool need_reset_params = false; + + /* Divider of the resolution for faster previews. + * + * Allows to re-use same render buffer, but have less pixels rendered into in it. The way to + * think of render buffer in this case is as an over-allocated array: the resolution divider + * affects both resolution and stride as visible by the integrator kernels. */ + int resolution_divider = 0; + + /* Paramaters of the big tile with the current resolution divider applied. */ + BufferParams effective_big_tile_params; + + /* Denosier was run and there are denoised versions of the passes in the render buffers. */ + bool has_denoised_result = false; + + /* Current tile has been written (to either disk or callback. + * Indicates that no more work will be done on this tile. */ + bool tile_written = false; + } render_state_; + + /* Progress object which is used to communicate sample progress. 
*/ + Progress *progress_; + + /* Fields required for canceling render on demand, as quickly as possible. */ + struct { + /* Indicates whether there is an on-going `render_samples()` call. */ + bool is_rendering = false; + + /* Indicates whether rendering is requested to be canceled by `cancel()`. */ + bool is_requested = false; + + /* Synchronization between thread which does `render_samples()` and thread which does + * `cancel()`. */ + thread_mutex mutex; + thread_condition_variable condition; + } render_cancel_; + + /* Indicates whether a render result was drawn after latest session reset. + * Used by `ready_to_reset()` to implement logic which feels the most interactive. */ + bool did_draw_after_reset_ = true; + + /* State of the full frame processing and writing to the software. */ + struct { + RenderBuffers *render_buffers = nullptr; + } full_frame_state_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp new file mode 100644 index 00000000000..d9634acac10 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work.cpp @@ -0,0 +1,203 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device.h" + +#include "integrator/path_trace_work.h" +#include "integrator/path_trace_work_cpu.h" +#include "integrator/path_trace_work_gpu.h" +#include "render/buffers.h" +#include "render/film.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<PathTraceWork> PathTraceWork::create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) +{ + if (device->info.type == DEVICE_CPU) { + return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag); + } + + return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag); +} + +PathTraceWork::PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : device_(device), + film_(film), + device_scene_(device_scene), + buffers_(make_unique<RenderBuffers>(device)), + effective_buffer_params_(buffers_->params), + cancel_requested_flag_(cancel_requested_flag) +{ +} + +PathTraceWork::~PathTraceWork() +{ +} + +RenderBuffers *PathTraceWork::get_render_buffers() +{ + return buffers_.get(); +} + +void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params) +{ + effective_full_params_ = effective_full_params; + effective_big_tile_params_ = effective_big_tile_params; + effective_buffer_params_ = effective_buffer_params; +} + +bool PathTraceWork::has_multiple_works() const +{ + /* Assume if there are multiple works working on the same big tile none of the works gets the + * entire big tile to work on. 
*/ + return !(effective_big_tile_params_.width == effective_buffer_params_.width && + effective_big_tile_params_.height == effective_buffer_params_.height && + effective_big_tile_params_.full_x == effective_buffer_params_.full_x && + effective_big_tile_params_.full_y == effective_buffer_params_.full_y); +} + +void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + copy_render_buffers_from_device(); + + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = buffers_->buffer.data(); + float *dst = render_buffers->buffer.data() + offset_in_floats; + + memcpy(dst, src, data_size); +} + +void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = render_buffers->buffer.data() + offset_in_floats; + float *dst = buffers_->buffer.data(); + + memcpy(dst, src, data_size); + + copy_render_buffers_to_device(); +} + +void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t offset_y = effective_buffer_params_.full_y - 
effective_big_tile_params_.full_y; + const int64_t offset = offset_y * width; + + render_buffers_host_copy_denoised( + buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset); + + copy_render_buffers_to_device(); +} + +bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination) +{ + const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int width = effective_buffer_params_.width; + + PassAccessor::Destination slice_destination = destination; + slice_destination.offset += offset_y * width; + + return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination); +} + +bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor, + const PassAccessor::Source &source) +{ + const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int width = effective_buffer_params_.width; + + PassAccessor::Source slice_source = source; + slice_source.offset += offset_y * width; + + return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source); +} + +PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const +{ + const KernelFilm &kfilm = device_scene_->data.film; + const KernelBackground &kbackground = device_scene_->data.background; + + const BufferParams ¶ms = buffers_->params; + + const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass()); + + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = display_pass->type; + pass_access_info.offset = PASS_UNUSED; + + if (pass_mode == PassMode::DENOISED) { + pass_access_info.mode = PassMode::DENOISED; + pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED); + } + + if (pass_access_info.offset == PASS_UNUSED) { + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = 
params.get_pass_offset(pass_access_info.type); + } + + pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher; + pass_access_info.use_approximate_shadow_catcher_background = + kfilm.use_approximate_shadow_catcher && !kbackground.transparent; + + return pass_access_info; +} + +PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template( + const GPUDisplay *gpu_display) const +{ + PassAccessor::Destination destination(film_->get_display_pass()); + + const int2 display_texture_size = gpu_display->get_texture_size(); + const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x; + const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y; + + destination.offset = texture_y * display_texture_size.x + texture_x; + destination.stride = display_texture_size.x; + + return destination; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h new file mode 100644 index 00000000000..97b97f3d888 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work.h @@ -0,0 +1,194 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "render/buffers.h" +#include "render/pass.h" +#include "util/util_types.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class DeviceScene; +class Film; +class GPUDisplay; +class RenderBuffers; + +class PathTraceWork { + public: + struct RenderStatistics { + float occupancy = 1.0f; + }; + + /* Create path trace work which fits best the device. + * + * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as + * possible. This could be, for example, request to cancel rendering on camera navigation in + * viewport. */ + static unique_ptr<PathTraceWork> create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual ~PathTraceWork(); + + /* Access the render buffers. + * + * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to + * correspond to the big tile size and relative device performance. */ + RenderBuffers *get_render_buffers(); + + /* Set effective parameters of the big tile and the work itself. */ + void set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params); + + /* Check whether the big tile is being worked on by multiple path trace works. */ + bool has_multiple_works() const; + + /* Allocate working memory for execution. Must be called before init_execution(). */ + virtual void alloc_work_memory(){}; + + /* Initialize execution of kernels. + * Will ensure that all device queues are initialized for execution. + * + * This method is to be called after any change in the scene. It is not needed to call it prior + * to every call of the `render_samples()`. */ + virtual void init_execution() = 0; + + /* Render given number of samples as a synchronous blocking call. 
+ * The samples are added to the render buffer associated with this work. */ + virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0; + + /* Copy render result from this work to the corresponding place of the GPU display. + * + * The `pass_mode` indicates whether to access denoised or noisy version of the display pass. The + * noisy pass mode will be passed here when it is known that the buffer does not have denoised + * passes yet (because denoiser did not run). If the denoised pass is requested and denoiser is + * not used then this function will fall-back to the noisy pass instead. */ + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) = 0; + + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0; + + /* Copy data from/to given render buffers. + * Will copy pixels from a corresponding place (from multi-device point of view) of the render + * buffers, and copy work's render buffers to the corresponding place of the destination. */ + + /* Notes: + * - Copies work's render buffer from the device. + * - Copies CPU-side buffer of the given buffer + * - Does not copy the buffer to its device. */ + void copy_to_render_buffers(RenderBuffers *render_buffers); + + /* Notes: + * - Does not copy given render buffers from the device. + * - Copies work's render buffer to its device. */ + void copy_from_render_buffers(const RenderBuffers *render_buffers); + + /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the + * given render buffers, leaving rest of the passes. + * + * Same notes about device copying applies to this call as well. */ + void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers); + + /* Copy render buffers to/from device using an appropriate device queue when needed so that + * things are executed in order with the `render_samples()`. 
*/ + virtual bool copy_render_buffers_from_device() = 0; + virtual bool copy_render_buffers_to_device() = 0; + + /* Zero render buffers to/from device using an appropriate device queue when needed so that + * things are executed in order with the `render_samples()`. */ + virtual bool zero_render_buffers() = 0; + + /* Access pixels rendered by this work and copy them to the corresponding location in the + * destination. + * + * NOTE: Does not perform copy of buffers from the device. Use `copy_render_tile_from_device()` + * to update host-side data. */ + bool get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination); + + /* Set pass data for baking. */ + bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source); + + /* Perform convergence test on the render buffer, and filter the convergence mask. + * Returns number of active pixels (the ones which did not converge yet). */ + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0; + + /* Run cryptomatte pass post-processing kernels. */ + virtual void cryptomatte_postproces() = 0; + + /* Cheap-ish request to see whether rendering is requested and is to be stopped as soon as + * possible, without waiting for any samples to be finished. */ + inline bool is_cancel_requested() const + { + /* NOTE: Rely on the fact that on x86 CPU reading scalar can happen without atomic even in + * threaded environment. */ + return *cancel_requested_flag_; + } + + /* Access to the device which is used to path trace this work on. 
*/ + Device *get_device() const + { + return device_; + } + + protected: + PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const; + + /* Get destination which offset and stride are configured so that writing to it will write to a + * proper location of GPU display texture, taking current tile and device slice into account. */ + PassAccessor::Destination get_gpu_display_destination_template( + const GPUDisplay *gpu_display) const; + + /* Device which will be used for path tracing. + * Note that it is an actual render device (and never is a multi-device). */ + Device *device_; + + /* Film is used to access display pass configuration for GPU display update. + * Note that only fields which are not a part of kernel data can be accessed via the Film. */ + Film *film_; + + /* Device side scene storage, that may be used for integrator logic. */ + DeviceScene *device_scene_; + + /* Render buffers where sampling is being accumulated into, allocated for a fraction of the big + * tile which is being rendered by this work. + * It also defines possible subset of a big tile in the case of multi-device rendering. */ + unique_ptr<RenderBuffers> buffers_; + + /* Effective parameters of the full, big tile, and current work render buffer. + * The latter might be different from buffers_->params when there is a resolution divider + * involved. 
*/ + BufferParams effective_full_params_; + BufferParams effective_big_tile_params_; + BufferParams effective_buffer_params_; + + bool *cancel_requested_flag_ = nullptr; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp new file mode 100644 index 00000000000..b9a33b64051 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -0,0 +1,281 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_work_cpu.h" + +#include "device/cpu/kernel.h" +#include "device/device.h" + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "util/util_atomic.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +/* Create TBB arena for execution of path tracing and rendering tasks. */ +static inline tbb::task_arena local_tbb_arena_create(const Device *device) +{ + /* TODO: limit this to number of threads of CPU device, it may be smaller than + * the system number of threads when we reduce the number of CPU threads in + * CPU + GPU rendering to dedicate some cores to handling the GPU device. */ + return tbb::task_arena(device->info.cpu_threads); +} + +/* Get CPUKernelThreadGlobals for the current thread. 
*/ +static inline CPUKernelThreadGlobals *kernel_thread_globals_get( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + const int thread_index = tbb::this_task_arena::current_thread_index(); + DCHECK_GE(thread_index, 0); + DCHECK_LE(thread_index, kernel_thread_globals.size()); + + return &kernel_thread_globals[thread_index]; +} + +PathTraceWorkCPU::PathTraceWorkCPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : PathTraceWork(device, film, device_scene, cancel_requested_flag), + kernels_(*(device->get_cpu_kernels())) +{ + DCHECK_EQ(device->info.type, DEVICE_CPU); +} + +void PathTraceWorkCPU::init_execution() +{ + /* Cache per-thread kernel globals. */ + device_->get_cpu_kernel_thread_globals(kernel_thread_globals_); +} + +void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + const int64_t image_width = effective_buffer_params_.width; + const int64_t image_height = effective_buffer_params_.height; + const int64_t total_pixels_num = image_width * image_height; + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.start_profiling(); + } + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) { + if (is_cancel_requested()) { + return; + } + + const int y = work_index / image_width; + const int x = work_index - y * image_width; + + KernelWorkTile work_tile; + work_tile.x = effective_buffer_params_.full_x + x; + work_tile.y = effective_buffer_params_.full_y + y; + work_tile.w = 1; + work_tile.h = 1; + work_tile.start_sample = start_sample; + work_tile.num_samples = 1; + work_tile.offset = effective_buffer_params_.offset; + work_tile.stride = effective_buffer_params_.stride; + + CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_); + + 
render_samples_full_pipeline(kernel_globals, work_tile, samples_num); + }); + }); + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.stop_profiling(); + } + + statistics.occupancy = 1.0f; +} + +void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num) +{ + const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher; + const bool has_bake = device_scene_->data.bake.use; + + IntegratorStateCPU integrator_states[2] = {}; + + IntegratorStateCPU *state = &integrator_states[0]; + IntegratorStateCPU *shadow_catcher_state = &integrator_states[1]; + + KernelWorkTile sample_work_tile = work_tile; + float *render_buffer = buffers_->buffer.data(); + + for (int sample = 0; sample < samples_num; ++sample) { + if (is_cancel_requested()) { + break; + } + + if (has_bake) { + if (!kernels_.integrator_init_from_bake( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + else { + if (!kernels_.integrator_init_from_camera( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + + kernels_.integrator_megakernel(kernel_globals, state, render_buffer); + + if (has_shadow_catcher) { + kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); + } + + ++sample_work_tile.start_sample; + } +} + +void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + half4 *rgba_half = gpu_display->map_texture_buffer(); + if (!rgba_half) { + /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for + * some implementations of GPUDisplay which can not map memory? 
*/ + return; + } + + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + + const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples); + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.pixels_half_rgba = rgba_half; + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); + }); + + gpu_display->unmap_texture_buffer(); +} + +void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/) +{ +} + +bool PathTraceWorkCPU::copy_render_buffers_from_device() +{ + return buffers_->copy_from_device(); +} + +bool PathTraceWorkCPU::copy_render_buffers_to_device() +{ + buffers_->buffer.copy_to_device(); + return true; +} + +bool PathTraceWorkCPU::zero_render_buffers() +{ + buffers_->zero(); + return true; +} + +int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int full_x = effective_buffer_params_.full_x; + const int full_y = effective_buffer_params_.full_y; + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + const int offset = effective_buffer_params_.offset; + const int stride = effective_buffer_params_.stride; + + float *render_buffer = buffers_->buffer.data(); + + uint num_active_pixels = 0; + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + + /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. 
*/ + local_arena.execute([&]() { + tbb::parallel_for(full_y, full_y + height, [&](int y) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + + bool row_converged = true; + uint num_row_pixels_active = 0; + for (int x = 0; x < width; ++x) { + if (!kernels_.adaptive_sampling_convergence_check( + kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) { + ++num_row_pixels_active; + row_converged = false; + } + } + + atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active); + + if (!row_converged) { + kernels_.adaptive_sampling_filter_x( + kernel_globals, render_buffer, y, full_x, width, offset, stride); + } + }); + }); + + if (num_active_pixels) { + local_arena.execute([&]() { + tbb::parallel_for(full_x, full_x + width, [&](int x) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + kernels_.adaptive_sampling_filter_y( + kernel_globals, render_buffer, x, full_y, height, offset, stride); + }); + }); + } + + return num_active_pixels; +} + +void PathTraceWorkCPU::cryptomatte_postproces() +{ + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + + float *render_buffer = buffers_->buffer.data(); + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + + /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. 
*/ + local_arena.execute([&]() { + tbb::parallel_for(0, height, [&](int y) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + int pixel_index = y * width; + + for (int x = 0; x < width; ++x, ++pixel_index) { + kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index); + } + }); + }); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h new file mode 100644 index 00000000000..ab729bbf879 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" + +#include "device/cpu/kernel_thread_globals.h" +#include "device/device_queue.h" + +#include "integrator/path_trace_work.h" + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct KernelWorkTile; +struct KernelGlobals; + +class CPUKernels; + +/* Implementation of PathTraceWork which schedules work on to queues pixel-by-pixel, + * for CPU devices. + * + * NOTE: For the CPU rendering there are assumptions about TBB arena size and number of concurrent + * queues on the render device which makes this work be only usable on CPU. 
*/ +class PathTraceWorkCPU : public PathTraceWork { + public: + PathTraceWorkCPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual void init_execution() override; + + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; + + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + + virtual bool copy_render_buffers_from_device() override; + virtual bool copy_render_buffers_to_device() override; + virtual bool zero_render_buffers() override; + + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override; + virtual void cryptomatte_postproces() override; + + protected: + /* Core path tracing routine. Renders given work time on the given queue. */ + void render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num); + + /* CPU kernels. */ + const CPUKernels &kernels_; + + /* Copy of kernel globals which is suitable for concurrent access from multiple threads. + * + * More specifically, the `kernel_globals_` is local to each threads and nobody else is + * accessing it, but some "localization" is required to decouple from kernel globals stored + * on the device level. */ + vector<CPUKernelThreadGlobals> kernel_thread_globals_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp new file mode 100644 index 00000000000..10baf869aa6 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -0,0 +1,933 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_work_gpu.h" + +#include "device/device.h" + +#include "integrator/pass_accessor_gpu.h" +#include "render/buffers.h" +#include "render/gpu_display.h" +#include "render/scene.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +PathTraceWorkGPU::PathTraceWorkGPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : PathTraceWork(device, film, device_scene, cancel_requested_flag), + queue_(device->gpu_queue_create()), + integrator_state_soa_kernel_features_(0), + integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE), + integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE), + integrator_shader_raytrace_sort_counter_( + device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE), + integrator_next_shadow_catcher_path_index_( + device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE), + queued_paths_(device, "queued_paths", MEM_READ_WRITE), + num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), + work_tiles_(device, "work_tiles", MEM_READ_WRITE), + gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), + max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))), + min_num_active_paths_(queue_->num_concurrent_busy_states()), + max_active_path_index_(0) +{ + memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_)); + + /* Limit number of active paths 
to the half of the overall state. This is due to the logic in the + * path compaction which relies on the fact that regeneration does not happen sooner than half of + * the states are available again. */ + min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2); +} + +void PathTraceWorkGPU::alloc_integrator_soa() +{ + /* IntegrateState allocated as structure of arrays. */ + + /* Check if we already allocated memory for the required features. */ + const uint kernel_features = device_scene_->data.kernel_features; + if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) { + return; + } + integrator_state_soa_kernel_features_ = kernel_features; + + /* Allocate a device only memory buffer before for each struct member, and then + * write the pointers into a struct that resides in constant memory. + * + * TODO: store float3 in separate XYZ arrays. */ +#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && \ + (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_END(name) \ + break; \ + } +#define KERNEL_STRUCT_END_ARRAY(name, 
array_size) \ + if (array_index == array_size - 1) { \ + break; \ + } \ + } +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY +} + +void PathTraceWorkGPU::alloc_integrator_queue() +{ + if (integrator_queue_counter_.size() == 0) { + integrator_queue_counter_.alloc(1); + integrator_queue_counter_.zero_to_device(); + integrator_queue_counter_.copy_from_device(); + integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *) + integrator_queue_counter_.device_pointer; + } + + /* Allocate data for active path index arrays. */ + if (num_queued_paths_.size() == 0) { + num_queued_paths_.alloc(1); + num_queued_paths_.zero_to_device(); + } + + if (queued_paths_.size() == 0) { + queued_paths_.alloc(max_num_paths_); + /* TODO: this could be skip if we had a function to just allocate on device. */ + queued_paths_.zero_to_device(); + } +} + +void PathTraceWorkGPU::alloc_integrator_sorting() +{ + /* Allocate arrays for shader sorting. */ + const int max_shaders = device_scene_->data.max_shaders; + if (integrator_shader_sort_counter_.size() < max_shaders) { + integrator_shader_sort_counter_.alloc(max_shaders); + integrator_shader_sort_counter_.zero_to_device(); + + integrator_shader_raytrace_sort_counter_.alloc(max_shaders); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } +} + +void PathTraceWorkGPU::alloc_integrator_path_split() +{ + if (integrator_next_shadow_catcher_path_index_.size() != 0) { + return; + } + + integrator_next_shadow_catcher_path_index_.alloc(1); + /* TODO(sergey): Use queue? 
*/ + integrator_next_shadow_catcher_path_index_.zero_to_device(); + + integrator_state_gpu_.next_shadow_catcher_path_index = + (int *)integrator_next_shadow_catcher_path_index_.device_pointer; +} + +void PathTraceWorkGPU::alloc_work_memory() +{ + alloc_integrator_soa(); + alloc_integrator_queue(); + alloc_integrator_sorting(); + alloc_integrator_path_split(); +} + +void PathTraceWorkGPU::init_execution() +{ + queue_->init_execution(); + + /* Copy to device side struct in constant memory. */ + device_->const_copy_to( + "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); +} + +void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to + * add more work (because tiles are smaller, so there is higher chance that more paths will + * become busy after adding new tiles). This is especially important for the shadow catcher which + * schedules work in halves of available number of paths. */ + work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8); + + work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num); + + enqueue_reset(); + + int num_iterations = 0; + uint64_t num_busy_accum = 0; + + /* TODO: set a hard limit in case of undetected kernel failures? */ + while (true) { + /* Enqueue work from the scheduler, on start or when there are not enough + * paths to keep the device occupied. */ + bool finished; + if (enqueue_work_tiles(finished)) { + /* Copy stats from the device. */ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + /* Stop if no more work remaining. */ + if (finished) { + break; + } + + /* Enqueue on of the path iteration kernels. */ + if (enqueue_path_iteration()) { + /* Copy stats from the device. 
*/ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + num_busy_accum += get_num_active_paths(); + ++num_iterations; + } + + statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_; +} + +DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const +{ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int max_num_queued = 0; + DeviceKernel kernel = DEVICE_KERNEL_NUM; + + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + if (queue_counter->num_queued[i] > max_num_queued) { + kernel = (DeviceKernel)i; + max_num_queued = queue_counter->num_queued[i]; + } + } + + return kernel; +} + +void PathTraceWorkGPU::enqueue_reset() +{ + void *args[] = {&max_num_paths_}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); + queue_->zero_to_device(integrator_queue_counter_); + queue_->zero_to_device(integrator_shader_sort_counter_); + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + + /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the + * counter on the host side because `zero_to_device()` is not doing it. */ + if (integrator_queue_counter_.host_pointer) { + memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size()); + } +} + +bool PathTraceWorkGPU::enqueue_path_iteration() +{ + /* Find kernel to execute, with max number of queued paths. */ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_active_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + num_active_paths += queue_counter->num_queued[i]; + } + + if (num_active_paths == 0) { + return false; + } + + /* Find kernel to execute, with max number of queued paths. 
*/ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel == DEVICE_KERNEL_NUM) { + return false; + } + + /* Finish shadows before potentially adding more shadow rays. We can only + * store one shadow ray in the integrator state. */ + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) { + if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return true; + } + else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return true; + } + } + + /* Schedule kernel with maximum number of queued items. */ + enqueue_path_iteration(kernel); + return true; +} + +void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel) +{ + void *d_path_index = (void *)NULL; + + /* Create array of path indices for which this kernel is queued to be executed. */ + int work_size = max_active_path_index_; + + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + int num_queued = queue_counter->num_queued[kernel]; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + /* Compute array of active paths, sorted by shader. */ + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel); + } + else if (num_queued < work_size) { + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Compute array of active shadow paths for specific kernel. 
*/ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel); + } + else { + /* Compute array of active paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel); + } + } + + DCHECK_LE(work_size, max_num_paths_); + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + /* Ray intersection kernels with integrator state. */ + void *args[] = {&d_path_index, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + /* Shading kernels with integrator state and render buffer. */ + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + + default: + LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel) + << " used for path iteration, should never happen."; + break; + } +} + +void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel]; + assert(d_counter != nullptr); + + /* Compute prefix sum of number of active paths with each shader. 
*/ + { + const int work_size = 1; + int max_shaders = device_scene_->data.max_shaders; + void *args[] = {&d_counter, &max_shaders}; + queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); + } + + queue_->zero_to_device(num_queued_paths_); + + /* Launch kernel to fill the active paths arrays. */ + { + /* TODO: this could be smaller for terminated paths based on amount of work we want + * to schedule. */ + const int work_size = max_active_path_index_; + + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), + &d_queued_paths, + &d_num_queued_paths, + &d_counter, + &d_queued_kernel}; + + queue_->enqueue(kernel, work_size, args); + } + + if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) { + queue_->zero_to_device(integrator_shader_sort_counter_); + } + else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + else { + assert(0); + } +} + +void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + + /* Launch kernel to fill the active paths arrays. */ + const int work_size = max_active_path_index_; + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = { + const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel}; + + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(kernel, work_size, args); +} + +void PathTraceWorkGPU::compact_states(const int num_active_paths) +{ + if (num_active_paths == 0) { + max_active_path_index_ = 0; + } + + /* Compact fragmented path states into the start of the array, moving any paths + * with index higher than the number of active paths into the gaps. 
*/ + if (max_active_path_index_ == num_active_paths) { + return; + } + + void *d_compact_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + + /* Create array with terminated paths that we can write to. */ + { + /* TODO: can the work size be reduced here? */ + int offset = num_active_paths; + int work_size = num_active_paths; + void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args); + } + + /* Create array of paths that we need to compact, where the path index is bigger + * than the number of active paths. */ + { + int work_size = max_active_path_index_; + void *args[] = { + &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args); + } + + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + int num_compact_paths = num_queued_paths_.data()[0]; + + /* Move paths into gaps. */ + if (num_compact_paths > 0) { + int work_size = num_compact_paths; + int active_states_offset = 0; + int terminated_states_offset = num_active_paths; + void *args[] = { + &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args); + } + + queue_->synchronize(); + + /* Adjust max active path index now we know which part of the array is actually used. */ + max_active_path_index_ = num_active_paths; +} + +bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished) +{ + /* If there are existing paths wait them to go to intersect closest kernel, which will align the + * wavefront of the existing and newely added paths. 
*/ + /* TODO: Check whether counting new intersection kernels here will have positive affect on the + * performance. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) { + return false; + } + + int num_active_paths = get_num_active_paths(); + + /* Don't schedule more work if cancelling. */ + if (is_cancel_requested()) { + if (num_active_paths == 0) { + finished = true; + } + return false; + } + + finished = false; + + vector<KernelWorkTile> work_tiles; + + int max_num_camera_paths = max_num_paths_; + int num_predicted_splits = 0; + + if (has_shadow_catcher()) { + /* When there are shadow catchers in the scene bounce from them will split the state. So we + * make sure there is enough space in the path states array to fit split states. + * + * Basically, when adding N new paths we ensure that there is 2*N available path states, so + * that all the new paths can be split. + * + * Note that it is possible that some of the current states can still split, so need to make + * sure there is enough space for them as well. */ + + /* Number of currently in-flight states which can still split. */ + const int num_scheduled_possible_split = shadow_catcher_count_possible_splits(); + + const int num_available_paths = max_num_paths_ - num_active_paths; + const int num_new_paths = num_available_paths / 2; + max_num_camera_paths = max(num_active_paths, + num_active_paths + num_new_paths - num_scheduled_possible_split); + num_predicted_splits += num_scheduled_possible_split + num_new_paths; + } + + /* Schedule when we're out of paths or there are too few paths to keep the + * device occupied. */ + int num_paths = num_active_paths; + if (num_paths == 0 || num_paths < min_num_active_paths_) { + /* Get work tiles until the maximum number of path is reached. 
*/ + while (num_paths < max_num_camera_paths) { + KernelWorkTile work_tile; + if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) { + work_tiles.push_back(work_tile); + num_paths += work_tile.w * work_tile.h * work_tile.num_samples; + } + else { + break; + } + } + + /* If we couldn't get any more tiles, we're done. */ + if (work_tiles.size() == 0 && num_paths == 0) { + finished = true; + return false; + } + } + + /* Initialize paths from work tiles. */ + if (work_tiles.size() == 0) { + return false; + } + + /* Compact state array when number of paths becomes small relative to the + * known maximum path index, which makes computing active index arrays slow. */ + compact_states(num_active_paths); + + if (has_shadow_catcher()) { + integrator_next_shadow_catcher_path_index_.data()[0] = num_paths; + queue_->copy_to_device(integrator_next_shadow_catcher_path_index_); + } + + enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE : + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, + work_tiles.data(), + work_tiles.size(), + num_active_paths, + num_predicted_splits); + + return true; +} + +void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel, + const KernelWorkTile work_tiles[], + const int num_work_tiles, + const int num_active_paths, + const int num_predicted_splits) +{ + /* Copy work tiles to device. 
*/ + if (work_tiles_.size() < num_work_tiles) { + work_tiles_.alloc(num_work_tiles); + } + + int path_index_offset = num_active_paths; + int max_tile_work_size = 0; + for (int i = 0; i < num_work_tiles; i++) { + KernelWorkTile &work_tile = work_tiles_.data()[i]; + work_tile = work_tiles[i]; + + const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples; + + work_tile.path_index_offset = path_index_offset; + work_tile.work_size = tile_work_size; + + path_index_offset += tile_work_size; + + max_tile_work_size = max(max_tile_work_size, tile_work_size); + } + + queue_->copy_to_device(work_tiles_); + + void *d_work_tiles = (void *)work_tiles_.device_pointer; + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, + const_cast<int *>(&num_work_tiles), + &d_render_buffer, + const_cast<int *>(&max_tile_work_size)}; + + queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args); + + max_active_path_index_ = path_index_offset + num_predicted_splits; +} + +int PathTraceWorkGPU::get_num_active_paths() +{ + /* TODO: this is wrong, does not account for duplicates with shadow! */ + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + DCHECK_GE(queue_counter->num_queued[i], 0) + << "Invalid number of queued states for kernel " + << device_kernel_as_string(static_cast<DeviceKernel>(i)); + num_paths += queue_counter->num_queued[i]; + } + + return num_paths; +} + +bool PathTraceWorkGPU::should_use_graphics_interop() +{ + /* There are few aspects with the graphics interop when using multiple devices caused by the fact + * that the GPUDisplay has a single texture: + * + * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when + * attempting to register OpenGL PBO which has been mapped. 
Which makes sense, because + * otherwise one would run into a conflict of where the source of truth is. */ + if (has_multiple_works()) { + return false; + } + + if (!interop_use_checked_) { + Device *device = queue_->device; + interop_use_ = device->should_use_graphics_interop(); + + if (interop_use_) { + VLOG(2) << "Will be using graphics interop GPU display update."; + } + else { + VLOG(2) << "Will be using naive GPU display update."; + } + + interop_use_checked_ = true; + } + + return interop_use_; +} + +void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (device_->have_error()) { + /* Don't attempt to update GPU display if the device has errors: the error state will make + * wrong decisions to happen about interop, causing more chained bugs. */ + return; + } + + if (!buffers_->buffer.device_pointer) { + LOG(WARNING) << "Request for GPU display update without allocated render buffers."; + return; + } + + if (should_use_graphics_interop()) { + if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) { + return; + } + + /* If error happens when trying to use graphics interop fallback to the native implementation + * and don't attempt to use interop for the further updates. 
*/ + interop_use_ = false; + } + + copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples); +} + +void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + const int full_x = effective_buffer_params_.full_x; + const int full_y = effective_buffer_params_.full_y; + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + const int final_width = buffers_->params.width; + const int final_height = buffers_->params.height; + + const int texture_x = full_x - effective_full_params_.full_x; + const int texture_y = full_y - effective_full_params_.full_y; + + /* Re-allocate display memory if needed, and make sure the device pointer is allocated. + * + * NOTE: allocation happens to the final resolution so that no re-allocation happens on every + * change of the resolution divider. However, if the display becomes smaller, shrink the + * allocated memory as well. */ + if (gpu_display_rgba_half_.data_width != final_width || + gpu_display_rgba_half_.data_height != final_height) { + gpu_display_rgba_half_.alloc(final_width, final_height); + /* TODO(sergey): There should be a way to make sure device-side memory is allocated without + * transfering zeroes to the device. 
*/ + queue_->zero_to_device(gpu_display_rgba_half_); + } + + PassAccessor::Destination destination(film_->get_display_pass()); + destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + gpu_display_rgba_half_.copy_from_device(); + + gpu_display->copy_pixels_to_texture( + gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); +} + +bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (!device_graphics_interop_) { + device_graphics_interop_ = queue_->graphics_interop_create(); + } + + const DeviceGraphicsInteropDestination graphics_interop_dst = + gpu_display->graphics_interop_get(); + device_graphics_interop_->set_destination(graphics_interop_dst); + + const device_ptr d_rgba_half = device_graphics_interop_->map(); + if (!d_rgba_half) { + return false; + } + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.d_pixels_half_rgba = d_rgba_half; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + device_graphics_interop_->unmap(); + + return true; +} + +void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +{ + if (!device_graphics_interop_) { + return; + } + gpu_display->graphics_interop_activate(); + device_graphics_interop_ = nullptr; + gpu_display->graphics_interop_deactivate(); +} + +void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples) +{ + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples); + + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); +} + +int 
PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset); + + if (num_active_pixels) { + enqueue_adaptive_sampling_filter_x(); + enqueue_adaptive_sampling_filter_y(); + queue_->synchronize(); + } + + return num_active_pixels; +} + +int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset) +{ + device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE); + num_active_pixels.alloc(1); + + queue_->zero_to_device(num_active_pixels); + + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&effective_buffer_params_.full_x), + const_cast<int *>(&effective_buffer_params_.full_y), + const_cast<int *>(&effective_buffer_params_.width), + const_cast<int *>(&effective_buffer_params_.height), + &threshold, + &reset, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride, + &num_active_pixels.device_pointer}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args); + + queue_->copy_from_device(num_active_pixels); + queue_->synchronize(); + + return num_active_pixels.data()[0]; +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x() +{ + const int work_size = effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + &effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args); +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y() +{ + const int work_size = effective_buffer_params_.width; + + void *args[] = 
{&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + &effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args); +} + +void PathTraceWorkGPU::cryptomatte_postproces() +{ + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&work_size), + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args); +} + +bool PathTraceWorkGPU::copy_render_buffers_from_device() +{ + queue_->copy_from_device(buffers_->buffer); + + /* Synchronize so that the CPU-side buffer is available at the exit of this function. */ + return queue_->synchronize(); +} + +bool PathTraceWorkGPU::copy_render_buffers_to_device() +{ + queue_->copy_to_device(buffers_->buffer); + + /* NOTE: The direct device access to the buffers only happens within this path trace work. The + * rest of communication happens via API calls which involves `copy_render_buffers_from_device()` + * which will perform synchronization as needed. 
*/ + + return true; +} + +bool PathTraceWorkGPU::zero_render_buffers() +{ + queue_->zero_to_device(buffers_->buffer); + + return true; +} + +bool PathTraceWorkGPU::has_shadow_catcher() const +{ + return device_scene_->data.integrator.has_shadow_catcher; +} + +int PathTraceWorkGPU::shadow_catcher_count_possible_splits() +{ + if (max_active_path_index_ == 0) { + return 0; + } + + if (!has_shadow_catcher()) { + return 0; + } + + queue_->zero_to_device(num_queued_paths_); + + const int work_size = max_active_path_index_; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths}; + + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args); + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + return num_queued_paths_.data()[0]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h new file mode 100644 index 00000000000..38788122b0d --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -0,0 +1,165 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" + +#include "device/device_graphics_interop.h" +#include "device/device_memory.h" +#include "device/device_queue.h" + +#include "integrator/path_trace_work.h" +#include "integrator/work_tile_scheduler.h" + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct KernelWorkTile; + +/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized + * to match device queue's number of path states. + * This implementation suits best devices which have a lot of integrator states, such as GPU. */ +class PathTraceWorkGPU : public PathTraceWork { + public: + PathTraceWorkGPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual void alloc_work_memory() override; + virtual void init_execution() override; + + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; + + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + + virtual bool copy_render_buffers_from_device() override; + virtual bool copy_render_buffers_to_device() override; + virtual bool zero_render_buffers() override; + + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override; + virtual void cryptomatte_postproces() override; + + protected: + void alloc_integrator_soa(); + void alloc_integrator_queue(); + void alloc_integrator_sorting(); + void alloc_integrator_path_split(); + + /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. 
*/ + DeviceKernel get_most_queued_kernel() const; + + void enqueue_reset(); + + bool enqueue_work_tiles(bool &finished); + void enqueue_work_tiles(DeviceKernel kernel, + const KernelWorkTile work_tiles[], + const int num_work_tiles, + const int num_active_paths, + const int num_predicted_splits); + + bool enqueue_path_iteration(); + void enqueue_path_iteration(DeviceKernel kernel); + + void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel); + void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel); + + void compact_states(const int num_active_paths); + + int get_num_active_paths(); + + /* Check whether graphics interop can be used for the GPUDisplay update. */ + bool should_use_graphics_interop(); + + /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the + * device, then copies pixels to the host and pushes them to the `gpu_display`. */ + void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + + /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability + * functionality, avoiding copy of pixels to the host. */ + bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + + /* Synchronously run film conversion kernel and store display result in the given destination. */ + void get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples); + + int adaptive_sampling_convergence_check_count_active(float threshold, bool reset); + void enqueue_adaptive_sampling_filter_x(); + void enqueue_adaptive_sampling_filter_y(); + + bool has_shadow_catcher() const; + + /* Count how many currently scheduled paths can still split. */ + int shadow_catcher_count_possible_splits(); + + /* Integrator queue. */ + unique_ptr<DeviceQueue> queue_; + + /* Scheduler which gives work to path tracing threads. 
*/ + WorkTileScheduler work_tile_scheduler_; + + /* Integrate state for paths. */ + IntegratorStateGPU integrator_state_gpu_; + /* SoA arrays for integrator state. */ + vector<unique_ptr<device_memory>> integrator_state_soa_; + uint integrator_state_soa_kernel_features_; + /* Keep track of number of queued kernels. */ + device_vector<IntegratorQueueCounter> integrator_queue_counter_; + /* Shader sorting. */ + device_vector<int> integrator_shader_sort_counter_; + device_vector<int> integrator_shader_raytrace_sort_counter_; + /* Path split. */ + device_vector<int> integrator_next_shadow_catcher_path_index_; + + /* Temporary buffer to get an array of queued path for a particular kernel. */ + device_vector<int> queued_paths_; + device_vector<int> num_queued_paths_; + + /* Temporary buffer for passing work tiles to kernel. */ + device_vector<KernelWorkTile> work_tiles_; + + /* Temporary buffer used by the copy_to_gpu_display() whenever graphics interoperability is not + * available. Is allocated on-demand. */ + device_vector<half4> gpu_display_rgba_half_; + + unique_ptr<DeviceGraphicsInterop> device_graphics_interop_; + + /* Cached result of device->should_use_graphics_interop(). */ + bool interop_use_checked_ = false; + bool interop_use_ = false; + + /* Maximum number of concurrent integrator states. */ + int max_num_paths_; + + /* Minimum number of paths which keeps the device bust. If the actual number of paths falls below + * this value more work will be scheduled. */ + int min_num_active_paths_; + + /* Maximum path index, effective number of paths used may be smaller than + * the size of the integrator_state_ buffer so can avoid iterating over the + * full buffer. 
*/ + int max_active_path_index_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp new file mode 100644 index 00000000000..4eb1dd941f9 --- /dev/null +++ b/intern/cycles/integrator/render_scheduler.cpp @@ -0,0 +1,1187 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/render_scheduler.h" + +#include "render/session.h" +#include "render/tile.h" +#include "util/util_logging.h" +#include "util/util_math.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Render scheduler. 
+ */ + +RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams ¶ms) + : headless_(params.headless), + background_(params.background), + pixel_size_(params.pixel_size), + tile_manager_(tile_manager), + default_start_resolution_divider_(pixel_size_ * 8) +{ + use_progressive_noise_floor_ = !background_; +} + +void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte) +{ + need_schedule_cryptomatte_ = need_schedule_cryptomatte; +} + +void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance) +{ + need_schedule_rebalance_works_ = need_schedule_rebalance; +} + +bool RenderScheduler::is_background() const +{ + return background_; +} + +void RenderScheduler::set_denoiser_params(const DenoiseParams ¶ms) +{ + denoiser_params_ = params; +} + +void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling) +{ + adaptive_sampling_ = adaptive_sampling; +} + +bool RenderScheduler::is_adaptive_sampling_used() const +{ + return adaptive_sampling_.use; +} + +void RenderScheduler::set_start_sample(int start_sample) +{ + start_sample_ = start_sample; +} + +int RenderScheduler::get_start_sample() const +{ + return start_sample_; +} + +void RenderScheduler::set_num_samples(int num_samples) +{ + num_samples_ = num_samples; +} + +int RenderScheduler::get_num_samples() const +{ + return num_samples_; +} + +void RenderScheduler::set_time_limit(double time_limit) +{ + time_limit_ = time_limit; +} + +double RenderScheduler::get_time_limit() const +{ + return time_limit_; +} + +int RenderScheduler::get_rendered_sample() const +{ + DCHECK_GT(get_num_rendered_samples(), 0); + + return start_sample_ + get_num_rendered_samples() - 1; +} + +int RenderScheduler::get_num_rendered_samples() const +{ + return state_.num_rendered_samples; +} + +void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) +{ + buffer_params_ = buffer_params; + + update_start_resolution_divider(); + + 
set_num_samples(num_samples); + + /* In background mode never do lower resolution render preview, as it is not really supported + * by the software. */ + if (background_) { + state_.resolution_divider = 1; + } + else { + /* NOTE: Divide by 2 because of the way how scheduling works: it advances resolution divider + * first and then initialized render work. */ + state_.resolution_divider = start_resolution_divider_ * 2; + } + + state_.num_rendered_samples = 0; + state_.last_display_update_time = 0.0; + state_.last_display_update_sample = -1; + + state_.last_rebalance_time = 0.0; + state_.num_rebalance_requested = 0; + state_.num_rebalance_changes = 0; + state_.last_rebalance_changed = false; + state_.need_rebalance_at_next_work = false; + + /* TODO(sergey): Choose better initial value. */ + /* NOTE: The adaptive sampling settings might not be available here yet. */ + state_.adaptive_sampling_threshold = 0.4f; + + state_.last_work_tile_was_denoised = false; + state_.tile_result_was_written = false; + state_.postprocess_work_scheduled = false; + state_.full_frame_work_scheduled = false; + state_.full_frame_was_written = false; + + state_.path_trace_finished = false; + + state_.start_render_time = 0.0; + state_.end_render_time = 0.0; + state_.time_limit_reached = false; + + state_.occupancy_num_samples = 0; + state_.occupancy = 1.0f; + + first_render_time_.path_trace_per_sample = 0.0; + first_render_time_.denoise_time = 0.0; + first_render_time_.display_update_time = 0.0; + + path_trace_time_.reset(); + denoise_time_.reset(); + adaptive_filter_time_.reset(); + display_update_time_.reset(); + rebalance_time_.reset(); +} + +void RenderScheduler::reset_for_next_tile() +{ + reset(buffer_params_, num_samples_); +} + +bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work) +{ + /* Move to the next resolution divider. Assume adaptive filtering is not needed during + * navigation. 
*/ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (render_work_reschedule_on_idle(render_work)) { + return true; + } + + state_.path_trace_finished = true; + + bool denoiser_delayed, denoiser_ready_to_display; + render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display); + + render_work.display.update = work_need_update_display(denoiser_delayed); + render_work.display.use_denoised_result = denoiser_ready_to_display; + + return false; +} + +bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work) +{ + if (!use_progressive_noise_floor_) { + return false; + } + + /* Move to the next resolution divider. Assume adaptive filtering is not needed during + * navigation. */ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (adaptive_sampling_.use) { + if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) { + state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2, + adaptive_sampling_.threshold); + + render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold; + render_work.adaptive_sampling.reset = true; + + return true; + } + } + + return false; +} + +void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work) +{ + VLOG(3) << "Schedule work for cancel."; + + /* Un-schedule samples: they will not be rendered and should not be counted. */ + state_.num_rendered_samples -= render_work.path_trace.num_samples; + + const bool has_rendered_samples = get_num_rendered_samples() != 0; + + /* Reset all fields of the previous work, canelling things like adaptive sampling filtering and + * denoising. + * However, need to preserve write requests, since those will not be possible to recover and + * writes are only to happen once. 
*/ + const bool tile_write = render_work.tile.write; + const bool full_write = render_work.full.write; + + render_work = RenderWork(); + + render_work.tile.write = tile_write; + render_work.full.write = full_write; + + /* Do not write tile if it has zero samples it it, treat it similarly to all other tiles which + * got cancelled. */ + if (!state_.tile_result_was_written && has_rendered_samples) { + render_work.tile.write = true; + } + + if (!state_.full_frame_was_written) { + render_work.full.write = true; + } + + /* Update current tile, but only if any sample was rendered. + * Allows to have latest state of tile visible while full buffer is being processed. + * + * Note that if there are no samples in the current tile its render buffer might have pixels + * remained from previous state. + * + * If the full result was written, then there is no way any updates were made to the render + * buffers. And the buffers might have been freed from the device, so display update is not + * possible. 
*/ + if (has_rendered_samples && !state_.full_frame_was_written) { + render_work.display.update = true; + } +} + +bool RenderScheduler::done() const +{ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (state_.path_trace_finished || state_.time_limit_reached) { + return true; + } + + return get_num_rendered_samples() >= num_samples_; +} + +RenderWork RenderScheduler::get_render_work() +{ + check_time_limit_reached(); + + const double time_now = time_dt(); + + if (done()) { + RenderWork render_work; + render_work.resolution_divider = state_.resolution_divider; + + if (!set_postprocess_render_work(&render_work)) { + set_full_frame_render_work(&render_work); + } + + if (!render_work) { + state_.end_render_time = time_now; + } + + update_state_for_render_work(render_work); + + return render_work; + } + + RenderWork render_work; + + if (state_.resolution_divider != pixel_size_) { + state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_); + state_.num_rendered_samples = 0; + state_.last_display_update_sample = -1; + } + + render_work.resolution_divider = state_.resolution_divider; + + render_work.path_trace.start_sample = get_start_sample_to_path_trace(); + render_work.path_trace.num_samples = get_num_samples_to_path_trace(); + + render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample()); + + /* NOTE: Rebalance scheduler requires current number of samples to not be advanced forward. */ + render_work.rebalance = work_need_rebalance(); + + /* NOTE: Advance number of samples now, so that filter and denoising check can see that all the + * samples are rendered. 
*/ + state_.num_rendered_samples += render_work.path_trace.num_samples; + + render_work.adaptive_sampling.filter = work_need_adaptive_filter(); + render_work.adaptive_sampling.threshold = work_adaptive_threshold(); + render_work.adaptive_sampling.reset = false; + + bool denoiser_delayed, denoiser_ready_to_display; + render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display); + + render_work.tile.write = done(); + + render_work.display.update = work_need_update_display(denoiser_delayed); + render_work.display.use_denoised_result = denoiser_ready_to_display; + + if (done()) { + set_postprocess_render_work(&render_work); + } + + update_state_for_render_work(render_work); + + return render_work; +} + +void RenderScheduler::update_state_for_render_work(const RenderWork &render_work) +{ + const double time_now = time_dt(); + + if (render_work.rebalance) { + state_.last_rebalance_time = time_now; + ++state_.num_rebalance_requested; + } + + /* A fallback display update time, for the case there is an error of display update, or when + * there is no display at all. 
*/ + if (render_work.display.update) { + state_.last_display_update_time = time_now; + state_.last_display_update_sample = state_.num_rendered_samples; + } + + state_.last_work_tile_was_denoised = render_work.tile.denoise; + state_.tile_result_was_written |= render_work.tile.write; + state_.full_frame_was_written |= render_work.full.write; +} + +bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work) +{ + if (state_.postprocess_work_scheduled) { + return false; + } + state_.postprocess_work_scheduled = true; + + bool any_scheduled = false; + + if (need_schedule_cryptomatte_) { + render_work->cryptomatte.postprocess = true; + any_scheduled = true; + } + + if (denoiser_params_.use && !state_.last_work_tile_was_denoised) { + render_work->tile.denoise = true; + any_scheduled = true; + } + + if (!state_.tile_result_was_written) { + render_work->tile.write = true; + any_scheduled = true; + } + + if (any_scheduled) { + render_work->display.update = true; + } + + return any_scheduled; +} + +void RenderScheduler::set_full_frame_render_work(RenderWork *render_work) +{ + if (state_.full_frame_work_scheduled) { + return; + } + + if (!tile_manager_.has_multiple_tiles()) { + /* There is only single tile, so all work has been performed already. */ + return; + } + + if (!tile_manager_.done()) { + /* There are still tiles to be rendered. */ + return; + } + + if (state_.full_frame_was_written) { + return; + } + + state_.full_frame_work_scheduled = true; + + render_work->full.write = true; +} + +/* Knowing time which it took to complete a task at the current resolution divider approximate how + * long it would have taken to complete it at a final resolution. 
*/ +static double approximate_final_time(const RenderWork &render_work, double time) +{ + if (render_work.resolution_divider == 1) { + return time; + } + + const double resolution_divider_sq = render_work.resolution_divider * + render_work.resolution_divider; + return time * resolution_divider_sq; +} + +void RenderScheduler::report_work_begin(const RenderWork &render_work) +{ + /* Start counting render time when rendering samples at their final resolution. + * + * NOTE: The work might have the path trace part be all zero: this happens when a post-processing + * work is scheduled after the path tracing. Checking for just a start sample doesn't work here + * because it might be wrongly 0. Check for whether path tracing is actually happening as it is + * expected to happen in the first work. */ + if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 && + render_work.path_trace.start_sample == get_start_sample()) { + state_.start_render_time = time_dt(); + } +} + +void RenderScheduler::report_path_trace_time(const RenderWork &render_work, + double time, + bool is_cancelled) +{ + path_trace_time_.add_wall(time); + + if (is_cancelled) { + return; + } + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.path_trace_per_sample = final_time_approx / + render_work.path_trace.num_samples; + } + + if (work_report_reset_average(render_work)) { + path_trace_time_.reset_average(); + } + + path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples); + + VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds."; +} + +void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy) +{ + state_.occupancy_num_samples = render_work.path_trace.num_samples; + state_.occupancy = occupancy; + VLOG(4) << "Measured path tracing occupancy: " << 
occupancy; +} + +void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work, + double time, + bool is_cancelled) +{ + adaptive_filter_time_.add_wall(time); + + if (is_cancelled) { + return; + } + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_report_reset_average(render_work)) { + adaptive_filter_time_.reset_average(); + } + + adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples); + + VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average() + << " seconds."; +} + +void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time) +{ + denoise_time_.add_wall(time); + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.denoise_time = final_time_approx; + } + + if (work_report_reset_average(render_work)) { + denoise_time_.reset_average(); + } + + denoise_time_.add_average(final_time_approx); + + VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds."; +} + +void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time) +{ + display_update_time_.add_wall(time); + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.display_update_time = final_time_approx; + } + + if (work_report_reset_average(render_work)) { + display_update_time_.reset_average(); + } + + display_update_time_.add_average(final_time_approx); + + VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds."; + + /* Move the display update moment further in time, so that logic which checks when last update + * did happen have more reliable point in time (without path tracing and denoising parts of the + * render work). 
*/ + state_.last_display_update_time = time_dt(); +} + +void RenderScheduler::report_rebalance_time(const RenderWork &render_work, + double time, + bool balance_changed) +{ + rebalance_time_.add_wall(time); + + if (work_report_reset_average(render_work)) { + rebalance_time_.reset_average(); + } + + rebalance_time_.add_average(time); + + if (balance_changed) { + ++state_.num_rebalance_changes; + } + + state_.last_rebalance_changed = balance_changed; + + VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds."; +} + +string RenderScheduler::full_report() const +{ + const double render_wall_time = state_.end_render_time - state_.start_render_time; + const int num_rendered_samples = get_num_rendered_samples(); + + string result = "\nRender Scheduler Summary\n\n"; + + { + string mode; + if (headless_) { + mode = "Headless"; + } + else if (background_) { + mode = "Background"; + } + else { + mode = "Interactive"; + } + result += "Mode: " + mode + "\n"; + } + + result += "Resolution: " + to_string(buffer_params_.width) + "x" + + to_string(buffer_params_.height) + "\n"; + + result += "\nAdaptive sampling:\n"; + result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n"; + if (adaptive_sampling_.use) { + result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n"; + result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n"; + result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n"; + } + + result += "\nDenoiser:\n"; + result += " Use: " + string_from_bool(denoiser_params_.use) + "\n"; + if (denoiser_params_.use) { + result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n"; + result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n"; + + string passes = "Color"; + if (denoiser_params_.use_pass_albedo) { + passes += ", Albedo"; + } + if (denoiser_params_.use_pass_normal) { + passes += ", Normal"; + } + + result += " Passes: " + passes + 
"\n"; + } + + if (state_.num_rebalance_requested) { + result += "\nRebalancer:\n"; + result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) + + "\n"; + result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) + + "\n"; + } + + result += "\nTime (in seconds):\n"; + result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average"); + result += string_printf(" %20s %20f %20f\n", + "Path Tracing", + path_trace_time_.get_wall(), + path_trace_time_.get_average()); + + if (adaptive_sampling_.use) { + result += string_printf(" %20s %20f %20f\n", + "Adaptive Filter", + adaptive_filter_time_.get_wall(), + adaptive_filter_time_.get_average()); + } + + if (denoiser_params_.use) { + result += string_printf( + " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average()); + } + + result += string_printf(" %20s %20f %20f\n", + "Display Update", + display_update_time_.get_wall(), + display_update_time_.get_average()); + + if (state_.num_rebalance_requested) { + result += string_printf(" %20s %20f %20f\n", + "Rebalance", + rebalance_time_.get_wall(), + rebalance_time_.get_average()); + } + + const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() + + denoise_time_.get_wall() + display_update_time_.get_wall(); + result += "\n Total: " + to_string(total_time) + "\n"; + + result += string_printf( + "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time); + + /* When adaptive sampling is used the average time becomes meaningless, because different samples + * will likely render different number of pixels. 
*/ + if (!adaptive_sampling_.use) { + result += string_printf("Average time per sample: %f seconds\n", + render_wall_time / num_rendered_samples); + } + + return result; +} + +double RenderScheduler::guess_display_update_interval_in_seconds() const +{ + return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples); +} + +double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples( + int num_rendered_samples) const +{ + double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit( + num_rendered_samples); + + if (time_limit_ != 0.0 && state_.start_render_time != 0.0) { + const double remaining_render_time = max(0.0, + time_limit_ - (time_dt() - state_.start_render_time)); + + update_interval = min(update_interval, remaining_render_time); + } + + return update_interval; +} + +/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based + * on a more careful experiments with viewport rendering. */ +double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit( + int num_rendered_samples) const +{ + /* TODO(sergey): Need a decision on whether this should be using number of samples rendered + * within the current render session, or use absolute number of samples with the start sample + * taken into account. It will depend on whether the start sample offset clears the render + * buffer. */ + + if (state_.need_rebalance_at_next_work) { + return 0.1; + } + if (state_.last_rebalance_changed) { + return 0.2; + } + + if (headless_) { + /* In headless mode do rare updates, so that the device occupancy is high, but there are still + * progress messages printed to the logs. */ + return 30.0; + } + + if (background_) { + if (num_rendered_samples < 32) { + return 1.0; + } + return 2.0; + } + + /* Render time and number of samples rendered are used to figure out the display update interval. 
+ * Render time is used to allow for fast display updates in the first few seconds of rendering + * on fast devices. Number of samples rendered is used to allow for potentially quicker display + * updates on slow devices during the first few samples. */ + const double render_time = path_trace_time_.get_wall(); + if (render_time < 1) { + return 0.1; + } + if (render_time < 2) { + return 0.25; + } + if (render_time < 4) { + return 0.5; + } + if (render_time < 8 || num_rendered_samples < 32) { + return 1.0; + } + return 2.0; +} + +int RenderScheduler::calculate_num_samples_per_update() const +{ + const double time_per_sample_average = path_trace_time_.get_average(); + const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average; + + const double update_interval_in_seconds = guess_display_update_interval_in_seconds(); + + return max(int(num_samples_in_second * update_interval_in_seconds), 1); +} + +int RenderScheduler::get_start_sample_to_path_trace() const +{ + return start_sample_ + state_.num_rendered_samples; +} + +/* Round number of samples to the closest power of two. + * Rounding might happen to higher or lower value depending on which one is closer. Such behavior + * allows to have number of samples to be power of two without diverging from the planned number of + * samples too much. 
*/ +static inline uint round_num_samples_to_power_of_2(const uint num_samples) +{ + if (num_samples == 1) { + return 1; + } + + if (is_power_of_two(num_samples)) { + return num_samples; + } + + const uint num_samples_up = next_power_of_two(num_samples); + const uint num_samples_down = num_samples_up - (num_samples_up >> 1); + + const uint delta_up = num_samples_up - num_samples; + const uint delta_down = num_samples - num_samples_down; + + if (delta_up <= delta_down) { + return num_samples_up; + } + + return num_samples_down; +} + +int RenderScheduler::get_num_samples_to_path_trace() const +{ + if (state_.resolution_divider != pixel_size_) { + return get_num_samples_during_navigation(state_.resolution_divider); + } + + /* Always start full resolution render with a single sample. Gives more instant feedback to + * artists, and allows to gather information for a subsequent path tracing works. Do it in the + * headless mode as well, to give some estimate of how long samples are taking. */ + if (state_.num_rendered_samples == 0) { + return 1; + } + + const int num_samples_per_update = calculate_num_samples_per_update(); + const int path_trace_start_sample = get_start_sample_to_path_trace(); + + /* Round number of samples to a power of two, so that division of path states into tiles goes in + * a more integer manner. + * This might make it so updates happens more rarely due to rounding up. In the test scenes this + * is not huge deal because it is not seen that more than 8 samples can be rendered between + * updates. If that becomes a problem we can add some extra rules like never allow to round up + * more than N samples. 
*/ + const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update); + + const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample; + + int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render); + + /* When enough statistics is available and doing an offlien rendering prefer to keep device + * occupied. */ + if (state_.occupancy_num_samples && (background_ || headless_)) { + /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes + * with good performance without forcing occupancy to be higher). */ + int num_samples_to_occupy = state_.occupancy_num_samples; + if (state_.occupancy < 0.5f) { + num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy); + } + + num_samples_to_render = max(num_samples_to_render, + min(num_samples_to_occupy, max_num_samples_to_render)); + } + + /* If adaptive sampling is not use, render as many samples per update as possible, keeping the + * device fully occupied, without much overhead of display updates. */ + if (!adaptive_sampling_.use) { + return num_samples_to_render; + } + + /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missing. This + * is to ensure that the final render is pixel-matched regardless of how many samples per second + * compute device can do. */ + + return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render); +} + +int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const +{ + /* Special trick for fast navigation: schedule multiple samples during fast navigation + * (which will prefer to use lower resolution to keep up with refresh rate). This gives more + * usable visual feedback for artists. There are a couple of tricks though. 
*/ + + if (is_denoise_active_during_update()) { + /* When denoising is used during navigation prefer using a higher resolution with less samples + * (scheduling less samples here will make it so the resolution_divider calculation will use a + * lower value for the divider). This is because both OpenImageDenoiser and OptiX denoiser + * give visually better results on a higher resolution image with less samples. */ + return 1; + } + + if (resolution_divider <= pixel_size_) { + /* When resolution divider is at or below pixel size, schedule one sample. This doesn't effect + * the sample count at this resolution division, but instead assists in the calculation of + * the resolution divider. */ + return 1; + } + + if (resolution_divider == pixel_size_ * 2) { + /* When resolution divider is the previous step to the final resolution, schedule two samples. + * This is so that rendering on lower resolution does not exceed time that it takes to render + * first sample at the full resolution. */ + return 2; + } + + /* Always render 4 samples, even if scene is configured for less. + * The idea here is to have enough information on the screen. Resolution divider of 2 allows us + * to have 4 time extra samples, so verall worst case timing is the same as the final resolution + * at one sample. */ + return 4; +} + +bool RenderScheduler::work_need_adaptive_filter() const +{ + return adaptive_sampling_.need_filter(get_rendered_sample()); +} + +float RenderScheduler::work_adaptive_threshold() const +{ + if (!use_progressive_noise_floor_) { + return adaptive_sampling_.threshold; + } + + return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold); +} + +bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display) +{ + delayed = false; + ready_to_display = true; + + if (!denoiser_params_.use) { + /* Denoising is disabled, no need to scheduler work for it. */ + return false; + } + + if (done()) { + /* Always denoise at the last sample. 
*/ + return true; + } + + if (background_) { + /* Background render, only denoise when rendering the last sample. */ + /* TODO(sergey): Follow similar logic to viewport, giving an overview of how final denoised + * image looks like even for the background rendering. */ + return false; + } + + /* Viewport render. */ + + /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as + * final samples. */ + const int num_samples_finished = state_.resolution_divider == pixel_size_ ? + state_.num_rendered_samples : + 1; + + /* Immediately denoise when we reach the start sample or last sample. */ + if (num_samples_finished == denoiser_params_.start_sample || + num_samples_finished == num_samples_) { + return true; + } + + /* Do not denoise until the sample at which denoising should start is reached. */ + if (num_samples_finished < denoiser_params_.start_sample) { + ready_to_display = false; + return false; + } + + /* Avoid excessive denoising in viewport after reaching a certain sample count and render time. + */ + /* TODO(sergey): Consider making time interval and sample configurable. */ + delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 && + (time_dt() - state_.last_display_update_time) < 1.0); + + return !delayed; +} + +bool RenderScheduler::work_need_update_display(const bool denoiser_delayed) +{ + if (headless_) { + /* Force disable display update in headless mode. There will be nothing to display the + * in-progress result. */ + return false; + } + + if (denoiser_delayed) { + /* If denoiser has been delayed the display can not be updated as it will not contain + * up-to-date state of the render result. */ + return false; + } + + if (!adaptive_sampling_.use) { + /* When adaptive sampling is not used the work is scheduled in a way that they keep render + * device busy for long enough, so that the display update can happen right after the + * rendering. 
*/ + return true; + } + + if (done() || state_.last_display_update_sample == -1) { + /* Make sure an initial and final results of adaptive sampling is communicated ot the display. + */ + return true; + } + + /* For the development purposes of adaptive sampling it might be very useful to see all updates + * of active pixels after convergence check. However, it would cause a slowdown for regular usage + * users. Possibly, make it a debug panel option to allow rapid update to ease development + * without need to re-compiled. */ + // if (work_need_adaptive_filter()) { + // return true; + // } + + /* When adaptive sampling is used, its possible that only handful of samples of a very simple + * scene will be scheduled to a powerful device (in order to not "miss" any of filtering points). + * We take care of skipping updates here based on when previous display update did happen. */ + const double update_interval = guess_display_update_interval_in_seconds_for_num_samples( + state_.last_display_update_sample); + return (time_dt() - state_.last_display_update_time) > update_interval; +} + +bool RenderScheduler::work_need_rebalance() +{ + /* This is the minimum time, as the rebalancing can not happen more often than the path trace + * work. */ + static const double kRebalanceIntervalInSeconds = 1; + + if (!need_schedule_rebalance_works_) { + return false; + } + + if (state_.resolution_divider != pixel_size_) { + /* Don't rebalance at a non-final resolution divider. Some reasons for this: + * - It will introduce unnecessary during navigation. + * - Per-render device timing information is not very reliable yet. 
*/ + return false; + } + + if (state_.num_rendered_samples == 0) { + state_.need_rebalance_at_next_work = true; + return false; + } + + if (state_.need_rebalance_at_next_work) { + state_.need_rebalance_at_next_work = false; + return true; + } + + if (state_.last_rebalance_changed) { + return true; + } + + return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds; +} + +void RenderScheduler::update_start_resolution_divider() +{ + if (start_resolution_divider_ == 0) { + /* Resolution divider has never been calculated before: use default resolution, so that we have + * somewhat good initial behavior, giving a chance to collect real numbers. */ + start_resolution_divider_ = default_start_resolution_divider_; + VLOG(3) << "Initial resolution divider is " << start_resolution_divider_; + return; + } + + if (first_render_time_.path_trace_per_sample == 0.0) { + /* Not enough information to calculate better resolution, keep the existing one. */ + return; + } + + const double desired_update_interval_in_seconds = + guess_viewport_navigation_update_interval_in_seconds(); + + const double actual_time_per_update = first_render_time_.path_trace_per_sample + + first_render_time_.denoise_time + + first_render_time_.display_update_time; + + /* Allow some percent of tolerance, so that if the render time is close enough to the higher + * resolution we prefer to use it instead of going way lower resolution and time way below the + * desired one. */ + const int resolution_divider_for_update = calculate_resolution_divider_for_time( + desired_update_interval_in_seconds * 1.4, actual_time_per_update); + + /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual + * render time is somewhere on a boundary between two resolutions. */ + + /* Never increase resolution to higher than the pixel size (which is possible if the scene is + * simple and compute device is fast). 
*/ + start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_); + + VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_; +} + +double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const +{ + if (is_denoise_active_during_update()) { + /* Use lower value than the non-denoised case to allow having more pixels to reconstruct the + * image from. With the faster updates and extra compute required the resolution becomes too + * low to give usable feedback. */ + /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser + * on GPU the value might need to become lower for faster navigation. */ + return 1.0 / 12.0; + } + + /* For the best match with the Blender's viewport the refresh ratio should be 60fps. This will + * avoid "jelly" effects. However, on a non-trivial scenes this can only be achieved with high + * values of the resolution divider which does not give very pleasant updates during navigation. + * Choose less frequent updates to allow more noise-free and higher resolution updates. */ + + /* TODO(sergey): Can look into heuristic which will allow to have 60fps if the resolution divider + * is not too high. Alternatively, synchronize Blender's overlays updates to Cycles updates. 
*/ + + return 1.0 / 30.0; +} + +bool RenderScheduler::is_denoise_active_during_update() const +{ + if (!denoiser_params_.use) { + return false; + } + + if (denoiser_params_.start_sample > 1) { + return false; + } + + return true; +} + +bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work) +{ + return render_work.resolution_divider == pixel_size_ && + render_work.path_trace.start_sample == start_sample_; +} + +bool RenderScheduler::work_report_reset_average(const RenderWork &render_work) +{ + /* When rendering at a non-final resolution divider time average is not very useful because it + * will either bias average down (due to lower render times on the smaller images) or will give + * incorrect result when trying to estimate time which would have spent on the final resolution. + * + * So we only accumulate average for the latest resolution divider which was rendered. */ + return render_work.resolution_divider != pixel_size_; +} + +void RenderScheduler::check_time_limit_reached() +{ + if (time_limit_ == 0.0) { + /* No limit is enforced. */ + return; + } + + if (state_.start_render_time == 0.0) { + /* Rendering did not start yet. */ + return; + } + + const double current_time = time_dt(); + + if (current_time - state_.start_render_time < time_limit_) { + /* Time limit is not reached yet. */ + return; + } + + state_.time_limit_reached = true; + state_.end_render_time = current_time; +} + +/* -------------------------------------------------------------------- + * Utility functions. + */ + +int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time) +{ + /* TODO(sergey): There should a non-iterative analytical formula here. */ + + int resolution_divider = 1; + + /* This algorithm iterates through resolution dividers until a divider is found that achieves + * the desired render time. 
A limit of default_start_resolution_divider_ is put in place as the + * maximum resolution divider to avoid an unreadable viewport due to a low resolution. + * pre_resolution_division_samples and post_resolution_division_samples are used in this + * calculation to better predict the performance impact of changing resolution divisions as + * the sample count can also change between resolution divisions. */ + while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) { + int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider); + resolution_divider = resolution_divider * 2; + int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider); + actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples; + } + + return resolution_divider; +} + +int calculate_resolution_divider_for_resolution(int width, int height, int resolution) +{ + if (resolution == INT_MAX) { + return 1; + } + + int resolution_divider = 1; + while (width * height > resolution * resolution) { + width = max(1, width / 2); + height = max(1, height / 2); + + resolution_divider <<= 1; + } + + return resolution_divider; +} + +int calculate_resolution_for_divider(int width, int height, int resolution_divider) +{ + const int pixel_area = width * height; + const int resolution = lround(sqrt(pixel_area)); + + return resolution / resolution_divider; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h new file mode 100644 index 00000000000..9c2d107e46d --- /dev/null +++ b/intern/cycles/integrator/render_scheduler.h @@ -0,0 +1,466 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/adaptive_sampling.h" +#include "integrator/denoiser.h" /* For DenoiseParams. */ +#include "render/buffers.h" +#include "util/util_string.h" + +CCL_NAMESPACE_BEGIN + +class SessionParams; +class TileManager; + +class RenderWork { + public: + int resolution_divider = 1; + + /* Initialize render buffers. + * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the + * baking target. */ + bool init_render_buffers = false; + + /* Path tracing samples information. */ + struct { + int start_sample = 0; + int num_samples = 0; + } path_trace; + + struct { + /* Check for convergency and filter the mask. */ + bool filter = false; + + float threshold = 0.0f; + + /* Reset convergency flag when filtering, forcing a re-check of whether pixel did converge. */ + bool reset = false; + } adaptive_sampling; + + struct { + bool postprocess = false; + } cryptomatte; + + /* Work related on the current tile. */ + struct { + /* Write render buffers of the current tile. + * + * It is up to the path trace to decide whether writing should happen via user-provided + * callback into the rendering software, or via tile manager into a partial file. */ + bool write = false; + + bool denoise = false; + } tile; + + /* Work related on the full-frame render buffer. */ + struct { + /* Write full render result. + * Implies reading the partial file from disk. */ + bool write = false; + } full; + + /* Display which is used to visualize render result. 
*/ + struct { + /* Display needs to be updated for the new render. */ + bool update = false; + + /* Display can use denoised result if available. */ + bool use_denoised_result = true; + } display; + + /* Re-balance multi-device scheduling after rendering this work. + * Note that the scheduler does not know anything about devices, so if there is only a single + * device used, then it is up to the PathTracer to ignore the balancing. */ + bool rebalance = false; + + /* Conversion to bool, to simplify checks about whether there is anything to be done for this + * work. */ + inline operator bool() const + { + return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise || + tile.write || full.write; + } +}; + +class RenderScheduler { + public: + RenderScheduler(TileManager &tile_manager, const SessionParams &params); + + /* Specify whether cryptomatte-related works are to be scheduled. */ + void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte); + + /* Allows to disable work re-balancing, allowing to schedule as much to a single device + * as possible. */ + void set_need_schedule_rebalance(bool need_schedule_rebalance); + + bool is_background() const; + + void set_denoiser_params(const DenoiseParams &params); + void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling); + + bool is_adaptive_sampling_used() const; + + /* Start sample for path tracing. + * The scheduler will schedule work using this sample as the first one. */ + void set_start_sample(int start_sample); + int get_start_sample() const; + + /* Number of samples to render, starting from start sample. + * The scheduler will schedule work in the range of + * [start_sample, start_sample + num_samples - 1], inclusively. */ + void set_num_samples(int num_samples); + int get_num_samples() const; + + /* Time limit for the path tracing tasks, in seconds. + * Zero disables the limit. 
*/ + void set_time_limit(double time_limit); + double get_time_limit() const; + + /* Get sample up to which rendering has been done. + * This is an absolute 0-based value. + * + * For example, if start sample is 10 and 5 samples were rendered, then this call will + * return 14. + * + * If there were no samples rendered, then the behavior is undefined. */ + int get_rendered_sample() const; + + /* Get number of samples rendered within the current scheduling session. + * + * For example, if start sample is 10 and 5 samples were rendered, then this call will + * return 5. + * + * Note that this is based on the scheduling information. In practice this means that if someone + * requested for work to render the scheduler considers the work done. */ + int get_num_rendered_samples() const; + + /* Reset scheduler, indicating that rendering will happen from scratch. + * Resets current rendered state, as well as scheduling information. */ + void reset(const BufferParams &buffer_params, int num_samples); + + /* Reset scheduler upon switching to a next tile. + * Will keep the same number of samples and full-frame render parameters, but will reset progress + * and allow scheduling render works from the beginning of the new tile. */ + void reset_for_next_tile(); + + /* Reschedule adaptive sampling work when all pixels did converge. + * If there is nothing else to be done for the adaptive sampling (pixels did converge to the + * final threshold) then false is returned and the render scheduler will stop scheduling path + * tracing works. Otherwise will modify the work's adaptive sampling settings to continue with + * a lower threshold. */ + bool render_work_reschedule_on_converge(RenderWork &render_work); + + /* Reschedule adaptive sampling work when the device is mostly on idle, but not all pixels yet + * converged. 
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and + * the path tracer is to finish the current pixels) then false is returned. */ + bool render_work_reschedule_on_idle(RenderWork &render_work); + + /* Reschedule work when rendering has been requested to cancel. + * + * Will skip all work which is not needed anymore because no more samples will be added (for + * example, adaptive sampling filtering and convergence check will be skipped). + * Will enable all work needed to make sure all passes are communicated to the software. + * + * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */ + void render_work_reschedule_on_cancel(RenderWork &render_work); + + RenderWork get_render_work(); + + /* Report that the path tracer started to work, after scene update and loading kernels. */ + void report_work_begin(const RenderWork &render_work); + + /* Report time (in seconds) which corresponding part of work took. */ + void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled); + void report_path_trace_occupancy(const RenderWork &render_work, float occupancy); + void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled); + void report_denoise_time(const RenderWork &render_work, double time); + void report_display_update_time(const RenderWork &render_work, double time); + void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed); + + /* Generate full multi-line report of the rendering process, including rendering parameters, + * times, and so on. */ + string full_report() const; + + protected: + /* Check whether all work has been scheduled and time limit was not exceeded. + * + * NOTE: Tricky bit: if the time limit was reached the done() is considered to be true, but some + * extra work needs to be scheduled to denoise and write final result. 
*/ + bool done() const; + + /* Update scheduling state for a newly scheduled work. + * Takes care of things like checking whether work was ever denoised, tile was written and states + * like that. */ + void update_state_for_render_work(const RenderWork &render_work); + + /* Returns true if any work was scheduled. */ + bool set_postprocess_render_work(RenderWork *render_work); + + /* Set work which is to be performed after all tiles have been rendered. */ + void set_full_frame_render_work(RenderWork *render_work); + + /* Update start resolution divider based on the accumulated timing information, preserving a + * nice navigation feel. */ + void update_start_resolution_divider(); + + /* Calculate desired update interval in seconds based on the current timings and settings. + * Will give an interval which provides good feeling updates during viewport navigation. */ + double guess_viewport_navigation_update_interval_in_seconds() const; + + /* Check whether denoising is active during interactive update while resolution divider is not + * unit. */ + bool is_denoise_active_during_update() const; + + /* Heuristic which aims to give perceptually pleasant update of display interval in a way that at + * lower samples and near the beginning of rendering, updates happen more often, but with higher + * number of samples and later in the render, updates happen less often but device occupancy + * goes higher. */ + double guess_display_update_interval_in_seconds() const; + double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const; + double guess_display_update_interval_in_seconds_for_num_samples_no_limit( + int num_rendered_samples) const; + + /* Calculate number of samples which can be rendered within current desired update interval which + * is calculated by `guess_update_interval_in_seconds()`. 
*/ + int calculate_num_samples_per_update() const; + + /* Get start sample and the number of samples which are to be path traced in the current work. */ + int get_start_sample_to_path_trace() const; + int get_num_samples_to_path_trace() const; + + /* Calculate how many samples there are to be rendered for the very first path trace after reset. + */ + int get_num_samples_during_navigation(int resolution_divider) const; + + /* Whether adaptive sampling convergence check and filter is to happen. */ + bool work_need_adaptive_filter() const; + + /* Calculate threshold for adaptive sampling. */ + float work_adaptive_threshold() const; + + /* Check whether current work needs denoising. + * Denoising is not needed if the denoiser is not configured, or when denoising is happening too + * often. + * + * The delayed will be true when the denoiser is configured for use, but it was delayed for a + * later sample, to reduce overhead. + * + * ready_to_display will be false if we may have a denoised result that is outdated due to + * increased samples. */ + bool work_need_denoise(bool &delayed, bool &ready_to_display); + + /* Check whether current work needs to update display. + * + * The `denoiser_delayed` is what `work_need_denoise()` returned as delayed denoiser flag. */ + bool work_need_update_display(const bool denoiser_delayed); + + /* Check whether it is time to perform rebalancing for the render work. */ + bool work_need_rebalance(); + + /* Check whether timings of the given work are usable to store timings in the `first_render_time_` + * for the resolution divider calculation. */ + bool work_is_usable_for_first_render_estimation(const RenderWork &render_work); + + /* Check whether timing report about the given work needs to reset accumulated average time. 
*/ + bool work_report_reset_average(const RenderWork &render_work); + + /* CHeck whether render time limit has been reached (or exceeded), and if so store related + * information in the state so that rendering is considered finished, and is possible to report + * average render time information. */ + void check_time_limit_reached(); + + /* Helper class to keep track of task timing. + * + * Contains two parts: wall time and average. The wall time is an actual wall time of how long it + * took to complete all tasks of a type. Is always advanced when PathTracer reports time update. + * + * The average time is used for scheduling purposes. It is estimated to be a time of how long it + * takes to perform task on the final resolution. */ + class TimeWithAverage { + public: + inline void reset() + { + total_wall_time_ = 0.0; + + average_time_accumulator_ = 0.0; + num_average_times_ = 0; + } + + inline void add_wall(double time) + { + total_wall_time_ += time; + } + + inline void add_average(double time, int num_measurements = 1) + { + average_time_accumulator_ += time; + num_average_times_ += num_measurements; + } + + inline double get_wall() const + { + return total_wall_time_; + } + + inline double get_average() const + { + if (num_average_times_ == 0) { + return 0; + } + return average_time_accumulator_ / num_average_times_; + } + + inline void reset_average() + { + average_time_accumulator_ = 0.0; + num_average_times_ = 0; + } + + protected: + double total_wall_time_ = 0.0; + + double average_time_accumulator_ = 0.0; + int num_average_times_ = 0; + }; + + struct { + int resolution_divider = 1; + + /* Number of rendered samples on top of the start sample. */ + int num_rendered_samples = 0; + + /* Point in time the latest GPUDisplay work has been scheduled. */ + double last_display_update_time = 0.0; + /* Value of -1 means display was never updated. */ + int last_display_update_sample = -1; + + /* Point in time at which last rebalance has been performed. 
*/ + double last_rebalance_time = 0.0; + + /* Number of rebalance works which have been requested to be performed. + * The path tracer might ignore the work if there is a single device rendering. */ + int num_rebalance_requested = 0; + + /* Number of rebalance works handled which did change balance across devices. */ + int num_rebalance_changes = 0; + + bool need_rebalance_at_next_work = false; + + /* Denotes whether the latest performed rebalance work caused an actual rebalance of work across + * devices. */ + bool last_rebalance_changed = false; + + /* Threshold for adaptive sampling which will be scheduled to work when not using progressive + * noise floor. */ + float adaptive_sampling_threshold = 0.0f; + + bool last_work_tile_was_denoised = false; + bool tile_result_was_written = false; + bool postprocess_work_scheduled = false; + bool full_frame_work_scheduled = false; + bool full_frame_was_written = false; + + bool path_trace_finished = false; + bool time_limit_reached = false; + + /* Time at which rendering started and finished. */ + double start_render_time = 0.0; + double end_render_time = 0.0; + + /* Measured occupancy of the render devices, normalized to the number of samples. + * + * In a way it is "trailing": when scheduling new work this occupancy is measured when the + * previous work was rendered. */ + int occupancy_num_samples = 0; + float occupancy = 1.0f; + } state_; + + /* Timing of tasks which were performed at the very first render work at 100% of the + * resolution. This timing information is used to estimate resolution divider for fast + * navigation. */ + struct { + double path_trace_per_sample; + double denoise_time; + double display_update_time; + } first_render_time_; + + TimeWithAverage path_trace_time_; + TimeWithAverage adaptive_filter_time_; + TimeWithAverage denoise_time_; + TimeWithAverage display_update_time_; + TimeWithAverage rebalance_time_; + + /* Whether cryptomatte-related work will be scheduled. 
*/ + bool need_schedule_cryptomatte_ = false; + + /* Whether to schedule device load rebalance works. + * Rebalancing requires some special treatment for update intervals and such, so if it's known + * that the rebalance will be ignored (due to single-device rendering i.e.) is better to fully + * ignore rebalancing logic. */ + bool need_schedule_rebalance_works_ = false; + + /* Path tracing work will be scheduled for samples from within + * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */ + int start_sample_ = 0; + int num_samples_ = 0; + + /* Limit in seconds for how long path tracing is allowed to happen. + * Zero means no limit is applied. */ + double time_limit_ = 0.0; + + /* Headless rendering without interface. */ + bool headless_; + + /* Background (offline) rendering. */ + bool background_; + + /* Pixel size is used to force lower resolution render for final pass. Useful for retina or other + * types of hi-dpi displays. */ + int pixel_size_ = 1; + + TileManager &tile_manager_; + + BufferParams buffer_params_; + DenoiseParams denoiser_params_; + + AdaptiveSampling adaptive_sampling_; + + /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise + * level. */ + bool use_progressive_noise_floor_ = false; + + /* Default value for the resolution divider which will be used when there is no render time + * information available yet. + * It is also what defines the upper limit of the automatically calculated resolution divider. */ + int default_start_resolution_divider_ = 1; + + /* Initial resolution divider which will be used on render scheduler reset. */ + int start_resolution_divider_ = 0; + + /* Calculate smallest resolution divider which will bring down actual rendering time below the + * desired one. This call assumes linear dependency of render time from number of pixels + * (quadratic dependency from the resolution divider): resolution divider of 2 brings render time + * down by a factor of 4. 
*/ + int calculate_resolution_divider_for_time(double desired_time, double actual_time); +}; + +int calculate_resolution_divider_for_resolution(int width, int height, int resolution); + +int calculate_resolution_for_divider(int width, int height, int resolution_divider); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp new file mode 100644 index 00000000000..465b4a8d4da --- /dev/null +++ b/intern/cycles/integrator/shader_eval.cpp @@ -0,0 +1,173 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/shader_eval.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress) +{ + DCHECK_NE(device_, nullptr); +} + +bool ShaderEval::eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output) +{ + bool first_device = true; + bool success = true; + + device_->foreach_device([&](Device *device) { + if (!first_device) { + LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a " + "single device."; + return; + } + first_device = false; + + device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY); + device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE); + + /* Allocate and copy device buffers. */ + DCHECK_EQ(input.device, device); + DCHECK_EQ(output.device, device); + DCHECK_LE(output.size(), input.size()); + + input.alloc(max_num_points); + int num_points = fill_input(input); + if (num_points == 0) { + return; + } + + input.copy_to_device(); + output.alloc(num_points); + output.zero_to_device(); + + /* Evaluate on CPU or GPU. */ + success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) : + eval_gpu(device, type, input, output); + + /* Copy data back from device if not cancelled. 
*/ + if (success) { + output.copy_from_device(0, 1, output.size()); + read_output(output); + } + + input.free(); + output.free(); + }); + + return success; +} + +bool ShaderEval::eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + vector<CPUKernelThreadGlobals> kernel_thread_globals; + device->get_cpu_kernel_thread_globals(kernel_thread_globals); + + /* Find required kernel function. */ + const CPUKernels &kernels = *(device->get_cpu_kernels()); + + /* Simple parallel_for over all work items. */ + const int64_t work_size = output.size(); + KernelShaderEvalInput *input_data = input.data(); + float4 *output_data = output.data(); + bool success = true; + + tbb::task_arena local_arena(device->info.cpu_threads); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) { + /* TODO: is this fast enough? */ + if (progress_.get_cancel()) { + success = false; + return; + } + + const int thread_index = tbb::this_task_arena::current_thread_index(); + KernelGlobals *kg = &kernel_thread_globals[thread_index]; + + switch (type) { + case SHADER_EVAL_DISPLACE: + kernels.shader_eval_displace(kg, input_data, output_data, work_index); + break; + case SHADER_EVAL_BACKGROUND: + kernels.shader_eval_background(kg, input_data, output_data, work_index); + break; + } + }); + }); + + return success; +} + +bool ShaderEval::eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + /* Find required kernel function. */ + DeviceKernel kernel; + switch (type) { + case SHADER_EVAL_DISPLACE: + kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE; + break; + case SHADER_EVAL_BACKGROUND: + kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND; + break; + }; + + /* Create device queue. 
*/ + unique_ptr<DeviceQueue> queue = device->gpu_queue_create(); + queue->init_execution(); + + /* Execute work on GPU in chunk, so we can cancel. + * TODO : query appropriate size from device.*/ + const int chunk_size = 65536; + + const int work_size = output.size(); + void *d_input = (void *)input.device_pointer; + void *d_output = (void *)output.device_pointer; + + for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) { + int d_work_size = min(chunk_size, work_size - d_offset); + void *args[] = {&d_input, &d_output, &d_offset, &d_work_size}; + + queue->enqueue(kernel, d_work_size, args); + queue->synchronize(); + + if (progress_.get_cancel()) { + return false; + } + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h new file mode 100644 index 00000000000..7dbf334b8d7 --- /dev/null +++ b/intern/cycles/integrator/shader_eval.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" + +#include "kernel/kernel_types.h" + +#include "util/util_function.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class Progress; + +enum ShaderEvalType { + SHADER_EVAL_DISPLACE, + SHADER_EVAL_BACKGROUND, +}; + +/* ShaderEval class performs shader evaluation for background light and displacement. 
*/ +class ShaderEval { + public: + ShaderEval(Device *device, Progress &progress); + + /* Evaluate shader at points specified by KernelShaderEvalInput and write out + * RGBA colors to output. */ + bool eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output); + + protected: + bool eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output); + bool eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output); + + Device *device_; + Progress &progress_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp new file mode 100644 index 00000000000..3387b7bedf1 --- /dev/null +++ b/intern/cycles/integrator/tile.cpp @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/tile.h" + +#include "util/util_logging.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size) +{ + os << "size: (" << tile_size.width << ", " << tile_size.height << ")"; + os << ", num_samples: " << tile_size.num_samples; + return os; +} + +ccl_device_inline uint round_down_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return prev_power_of_two(x); +} + +ccl_device_inline uint round_up_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return next_power_of_two(x); +} + +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states) +{ + if (max_num_path_states == 1) { + /* Simple case: avoid any calculation, which could cause rounding issues. */ + return TileSize(1, 1, 1); + } + + const int64_t num_pixels = image_size.x * image_size.y; + const int64_t num_pixel_samples = num_pixels * num_samples; + + if (max_num_path_states >= num_pixel_samples) { + /* Image fully fits into the state (could be border render, for example). */ + return TileSize(image_size.x, image_size.y, num_samples); + } + + /* The idea here is to keep number of samples per tile as much as possible to improve coherency + * across threads. + * + * Some general ideas: + * - Prefer smaller tiles with more samples, which improves spatial coherency of paths. + * - Keep values a power of two, for more integer fit into the maximum number of paths. */ + + TileSize tile_size; + + /* Calculate tile size as if it is the most possible one to fit an entire range of samples. + * The idea here is to keep tiles as small as possible, and keep device occupied by scheduling + * multiple tiles with the same coordinates rendering different samples. 
*/ + const int num_path_states_per_sample = max_num_path_states / num_samples; + if (num_path_states_per_sample != 0) { + tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample))); + tile_size.height = tile_size.width; + } + else { + tile_size.width = tile_size.height = 1; + } + + if (num_samples == 1) { + tile_size.num_samples = 1; + } + else { + /* Heuristic here is to have more uniform division of the sample range: for example prefer + * [32 <38 times>, 8] over [1024, 200]. This allows to greedily add more tiles early on. */ + tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))), + static_cast<uint>(num_samples)); + + const int tile_area = tile_size.width / tile_size.height; + tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area); + } + + DCHECK_GE(tile_size.width, 1); + DCHECK_GE(tile_size.height, 1); + DCHECK_GE(tile_size.num_samples, 1); + DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states); + + return tile_size; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h new file mode 100644 index 00000000000..d0824843ddb --- /dev/null +++ b/intern/cycles/integrator/tile.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include <ostream> + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct TileSize { + TileSize() = default; + + inline TileSize(int width, int height, int num_samples) + : width(width), height(height), num_samples(num_samples) + { + } + + inline bool operator==(const TileSize &other) const + { + return width == other.width && height == other.height && num_samples == other.num_samples; + } + inline bool operator!=(const TileSize &other) const + { + return !(*this == other); + } + + int width = 0, height = 0; + int num_samples = 0; +}; + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size); + +/* Calculate tile size which is best suitable for rendering image of a given size with given number + * of active path states. + * Will attempt to provide best guess to keep path tracing threads of a device as localized as + * possible, and have as many threads active for every tile as possible. */ +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp new file mode 100644 index 00000000000..9f96fe3632b --- /dev/null +++ b/intern/cycles/integrator/work_balancer.cpp @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/work_balancer.h" + +#include "util/util_math.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos) +{ + const int num_infos = work_balance_infos.size(); + + if (num_infos == 1) { + work_balance_infos[0].weight = 1.0; + return; + } + + /* There is no statistics available, so start with an equal distribution. */ + const double weight = 1.0 / num_infos; + for (WorkBalanceInfo &balance_info : work_balance_infos) { + balance_info.weight = weight; + } +} + +static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos) +{ + double total_time = 0; + for (const WorkBalanceInfo &info : work_balance_infos) { + total_time += info.time_spent; + } + return total_time; +} + +/* The balance is based on equalizing time which devices spent performing a task. Assume that + * average of the observed times is usable for estimating whether more or less work is to be + * scheduled, and how difference in the work scheduling is needed. */ + +bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos) +{ + const int num_infos = work_balance_infos.size(); + + const double total_time = calculate_total_time(work_balance_infos); + const double time_average = total_time / num_infos; + + double total_weight = 0; + vector<double> new_weights; + new_weights.reserve(num_infos); + + /* Equalize the overall average time. This means that we don't make it so every work will perform + * amount of work based on the current average, but that after the weights changes the time will + * equalize. + * Can think of it that if one of the devices is 10% faster than another, then one device needs + * to do 5% less of the current work, and another needs to do 5% more. 
*/ + const double lerp_weight = 1.0 / num_infos; + + bool has_big_difference = false; + + for (const WorkBalanceInfo &info : work_balance_infos) { + const double time_target = lerp(info.time_spent, time_average, lerp_weight); + const double new_weight = info.weight * time_target / info.time_spent; + new_weights.push_back(new_weight); + total_weight += new_weight; + + if (std::fabs(1.0 - time_target / time_average) > 0.02) { + has_big_difference = true; + } + } + + if (!has_big_difference) { + return false; + } + + const double total_weight_inv = 1.0 / total_weight; + for (int i = 0; i < num_infos; ++i) { + WorkBalanceInfo &info = work_balance_infos[i]; + info.weight = new_weights[i] * total_weight_inv; + info.time_spent = 0; + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h new file mode 100644 index 00000000000..94e20ecf054 --- /dev/null +++ b/intern/cycles/integrator/work_balancer.h @@ -0,0 +1,42 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct WorkBalanceInfo { + /* Time spent performing corresponding work. */ + double time_spent = 0; + + /* Average occupancy of the device while performing the work. 
*/ + float occupancy = 1.0f; + + /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of + * the big tile which is to be rendered on the device). */ + double weight = 1.0; +}; + +/* Balance work for an initial render interation, before any statistics is known. */ +void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos); + +/* Rebalance work after statistics has been accumulated. + * Returns true if the balancing did change. */ +bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp new file mode 100644 index 00000000000..3fc99d5b74d --- /dev/null +++ b/intern/cycles/integrator/work_tile_scheduler.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/work_tile_scheduler.h" + +#include "device/device_queue.h" +#include "integrator/tile.h" +#include "render/buffers.h" +#include "util/util_atomic.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +WorkTileScheduler::WorkTileScheduler() +{ +} + +void WorkTileScheduler::set_max_num_path_states(int max_num_path_states) +{ + max_num_path_states_ = max_num_path_states; +} + +void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num) +{ + /* Image buffer parameters. 
*/ + image_full_offset_px_.x = buffer_params.full_x; + image_full_offset_px_.y = buffer_params.full_y; + + image_size_px_ = make_int2(buffer_params.width, buffer_params.height); + + offset_ = buffer_params.offset; + stride_ = buffer_params.stride; + + /* Samples parameters. */ + sample_start_ = sample_start; + samples_num_ = samples_num; + + /* Initialize new scheduling. */ + reset_scheduler_state(); +} + +void WorkTileScheduler::reset_scheduler_state() +{ + tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_); + + VLOG(3) << "Will schedule tiles of size " << tile_size_; + + if (VLOG_IS_ON(3)) { + /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling + * and purely focusing on the number of used path states. */ + const int num_path_states_in_tile = tile_size_.width * tile_size_.height * + tile_size_.num_samples; + const int num_tiles = max_num_path_states_ / num_path_states_in_tile; + VLOG(3) << "Number of unused path states: " + << max_num_path_states_ - num_tiles * num_path_states_in_tile; + } + + num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width); + num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height); + + total_tiles_num_ = num_tiles_x_ * num_tiles_y_; + num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples); + + next_work_index_ = 0; + total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_; +} + +bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size) +{ + /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because + * the path trace work can decice to use smaller tile sizes and greedily schedule multiple tiles, + * improving overall device occupancy. + * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling + * limit". 
*/ + + DCHECK_NE(max_num_path_states_, 0); + + const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1); + if (work_index >= total_work_size_) { + return false; + } + + const int sample_range_index = work_index % num_tiles_per_sample_range_; + const int start_sample = sample_range_index * tile_size_.num_samples; + const int tile_index = work_index / num_tiles_per_sample_range_; + const int tile_y = tile_index / num_tiles_x_; + const int tile_x = tile_index - tile_y * num_tiles_x_; + + KernelWorkTile work_tile; + work_tile.x = tile_x * tile_size_.width; + work_tile.y = tile_y * tile_size_.height; + work_tile.w = tile_size_.width; + work_tile.h = tile_size_.height; + work_tile.start_sample = sample_start_ + start_sample; + work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample); + work_tile.offset = offset_; + work_tile.stride = stride_; + + work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x); + work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y); + + work_tile.x += image_full_offset_px_.x; + work_tile.y += image_full_offset_px_.y; + + const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples; + + DCHECK_GT(tile_work_size, 0); + + if (max_work_size && tile_work_size > max_work_size) { + /* The work did not fit into the requested limit of the work size. Unschedule the tile, + * allowing others (or ourselves later one) to pick it up. + * + * TODO: Such temporary decrement is not ideal, since it might lead to situation when another + * device sees there is nothing to be done, finishing its work and leaving all work to be + * done by us. 
*/ + atomic_fetch_and_add_int32(&next_work_index_, -1); + return false; + } + + *work_tile_ = work_tile; + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h new file mode 100644 index 00000000000..e4c8f701259 --- /dev/null +++ b/intern/cycles/integrator/work_tile_scheduler.h @@ -0,0 +1,98 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/tile.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; + +struct KernelWorkTile; + +/* Scheduler of device work tiles. + * Takes care of feeding multiple devices running in parallel a work which needs to be done. */ +class WorkTileScheduler { + public: + WorkTileScheduler(); + + /* MAximum path states which are allowed to be used by a single scheduled work tile. + * + * Affects the scheduled work size: the work size will be as big as possible, but will not exceed + * this number of states. */ + void set_max_num_path_states(int max_num_path_states); + + /* Scheduling will happen for pixels within a big tile denotes by its parameters. */ + void reset(const BufferParams &buffer_params, int sample_start, int samples_num); + + /* Get work for a device. + * Returns true if there is still work to be done and initialize the work tile to all + * parameters of this work. 
If there is nothing remaining to be done, returns false and the + * work tile is kept unchanged. + * + * Optionally pass max_work_size to do nothing if there is no tile small enough. */ + bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0); + + protected: + void reset_scheduler_state(); + + /* Maximum allowed path states to be used. + * + * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the + * number of path states is kind of a detail. Is there a more generic term from the scheduler + * point of view? */ + int max_num_path_states_ = 0; + + /* Offset in pixels within a global buffer. */ + int2 image_full_offset_px_ = make_int2(0, 0); + + /* dimensions of the currently rendering image in pixels. */ + int2 image_size_px_ = make_int2(0, 0); + + /* Offset and stride of the buffer within which scheduing is happenning. + * Will be passed over to the KernelWorkTile. */ + int offset_, stride_; + + /* Start sample of index and number of samples which are to be rendered. + * The scheduler will cover samples range of [start, start + num] over the entire image + * (splitting into a smaller work tiles). */ + int sample_start_ = 0; + int samples_num_ = 0; + + /* Tile size which be scheduled for rendering. */ + TileSize tile_size_; + + /* Number of tiles in X and Y axis of the image. */ + int num_tiles_x_, num_tiles_y_; + + /* Total number of tiles on the image. + * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`. + * + * TODO(sergey): Is this an over-optimization? Maybe it's unmeasurable to calculate the value + * in the `get_work()`? */ + int total_tiles_num_ = 0; + + /* In the case when the number of sam[les in the `tile_size_` is lower than samples_num_ denotes + * how many tiles are to be "stacked" to cover the entire requested range of samples. 
*/ + int num_tiles_per_sample_range_ = 0; + + int next_work_index_ = 0; + int total_work_size_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 0ce33c51778..4196539a9b1 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -22,68 +22,22 @@ set(INC_SYS ) -set(SRC_CPU_KERNELS - kernels/cpu/kernel.cpp - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - kernels/cpu/kernel_split_avx.cpp - kernels/cpu/kernel_split_avx2.cpp - kernels/cpu/filter.cpp - kernels/cpu/filter_sse2.cpp - kernels/cpu/filter_sse3.cpp - kernels/cpu/filter_sse41.cpp - kernels/cpu/filter_avx.cpp - kernels/cpu/filter_avx2.cpp +set(SRC_DEVICE_CPU + device/cpu/kernel.cpp + device/cpu/kernel_sse2.cpp + device/cpu/kernel_sse3.cpp + device/cpu/kernel_sse41.cpp + device/cpu/kernel_avx.cpp + device/cpu/kernel_avx2.cpp ) -set(SRC_CUDA_KERNELS - kernels/cuda/kernel.cu - kernels/cuda/kernel_split.cu - kernels/cuda/filter.cu +set(SRC_DEVICE_CUDA + device/cuda/kernel.cu ) -set(SRC_OPENCL_KERNELS - kernels/opencl/kernel_adaptive_stopping.cl - kernels/opencl/kernel_adaptive_filter_x.cl - kernels/opencl/kernel_adaptive_filter_y.cl - kernels/opencl/kernel_adaptive_adjust_samples.cl - kernels/opencl/kernel_bake.cl - kernels/opencl/kernel_base.cl - kernels/opencl/kernel_displace.cl - kernels/opencl/kernel_background.cl - kernels/opencl/kernel_state_buffer_size.cl - kernels/opencl/kernel_split_bundle.cl - kernels/opencl/kernel_data_init.cl - kernels/opencl/kernel_path_init.cl - kernels/opencl/kernel_queue_enqueue.cl - kernels/opencl/kernel_scene_intersect.cl - kernels/opencl/kernel_lamp_emission.cl - kernels/opencl/kernel_do_volume.cl - 
kernels/opencl/kernel_indirect_background.cl - kernels/opencl/kernel_shader_setup.cl - kernels/opencl/kernel_shader_sort.cl - kernels/opencl/kernel_shader_eval.cl - kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl - kernels/opencl/kernel_subsurface_scatter.cl - kernels/opencl/kernel_direct_lighting.cl - kernels/opencl/kernel_shadow_blocked_ao.cl - kernels/opencl/kernel_shadow_blocked_dl.cl - kernels/opencl/kernel_enqueue_inactive.cl - kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_indirect_subsurface.cl - kernels/opencl/kernel_buffer_update.cl - kernels/opencl/filter.cl -) - -set(SRC_OPTIX_KERNELS - kernels/optix/kernel_optix.cu +set(SRC_DEVICE_OPTIX + device/optix/kernel.cu + device/optix/kernel_shader_raytrace.cu ) set(SRC_BVH_HEADERS @@ -105,63 +59,56 @@ set(SRC_HEADERS kernel_bake.h kernel_camera.h kernel_color.h - kernel_compat_cpu.h - kernel_compat_cuda.h - kernel_compat_optix.h - kernel_compat_opencl.h kernel_differential.h kernel_emission.h kernel_film.h - kernel_globals.h kernel_id_passes.h kernel_jitter.h kernel_light.h kernel_light_background.h kernel_light_common.h + kernel_lookup_table.h kernel_math.h kernel_montecarlo.h kernel_passes.h - kernel_path.h - kernel_path_branched.h - kernel_path_common.h kernel_path_state.h - kernel_path_surface.h - kernel_path_subsurface.h - kernel_path_volume.h kernel_profiling.h kernel_projection.h - kernel_queues.h kernel_random.h kernel_shader.h - kernel_shadow.h - kernel_subsurface.h + kernel_shadow_catcher.h kernel_textures.h kernel_types.h - kernel_volume.h kernel_work_stealing.h kernel_write_passes.h ) -set(SRC_KERNELS_CPU_HEADERS - kernel.h - kernels/cpu/kernel_cpu.h - kernels/cpu/kernel_cpu_impl.h - kernels/cpu/kernel_cpu_image.h - kernels/cpu/filter_cpu.h - kernels/cpu/filter_cpu_impl.h +set(SRC_DEVICE_CPU_HEADERS + device/cpu/compat.h + device/cpu/image.h + device/cpu/globals.h + device/cpu/kernel.h + device/cpu/kernel_arch.h + device/cpu/kernel_arch_impl.h ) - 
-set(SRC_KERNELS_CUDA_HEADERS - kernels/cuda/kernel_config.h - kernels/cuda/kernel_cuda_image.h +set(SRC_DEVICE_GPU_HEADERS + device/gpu/image.h + device/gpu/kernel.h + device/gpu/parallel_active_index.h + device/gpu/parallel_prefix_sum.h + device/gpu/parallel_reduce.h + device/gpu/parallel_sorted_index.h ) -set(SRC_KERNELS_OPTIX_HEADERS +set(SRC_DEVICE_CUDA_HEADERS + device/cuda/compat.h + device/cuda/config.h + device/cuda/globals.h ) -set(SRC_KERNELS_OPENCL_HEADERS - kernels/opencl/kernel_split_function.h - kernels/opencl/kernel_opencl_image.h +set(SRC_DEVICE_OPTIX_HEADERS + device/optix/compat.h + device/optix/globals.h ) set(SRC_CLOSURE_HEADERS @@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS geom/geom_object.h geom/geom_patch.h geom/geom_primitive.h + geom/geom_shader_data.h geom/geom_subd_triangle.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h ) -set(SRC_FILTER_HEADERS - filter/filter.h - filter/filter_defines.h - filter/filter_features.h - filter/filter_features_sse.h - filter/filter_kernel.h - filter/filter_nlm_cpu.h - filter/filter_nlm_gpu.h - filter/filter_prefilter.h - filter/filter_reconstruction.h - filter/filter_transform.h - filter/filter_transform_gpu.h - filter/filter_transform_sse.h +set(SRC_INTEGRATOR_HEADERS + integrator/integrator_init_from_bake.h + integrator/integrator_init_from_camera.h + integrator/integrator_intersect_closest.h + integrator/integrator_intersect_shadow.h + integrator/integrator_intersect_subsurface.h + integrator/integrator_intersect_volume_stack.h + integrator/integrator_megakernel.h + integrator/integrator_shade_background.h + integrator/integrator_shade_light.h + integrator/integrator_shade_shadow.h + integrator/integrator_shade_surface.h + integrator/integrator_shade_volume.h + integrator/integrator_state.h + integrator/integrator_state_flow.h + integrator/integrator_state_template.h + integrator/integrator_state_util.h + integrator/integrator_subsurface.h + integrator/integrator_volume_stack.h ) 
set(SRC_UTIL_HEADERS @@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS ../util/util_types_vector3_impl.h ) -set(SRC_SPLIT_HEADERS - split/kernel_adaptive_adjust_samples.h - split/kernel_adaptive_filter_x.h - split/kernel_adaptive_filter_y.h - split/kernel_adaptive_stopping.h - split/kernel_branched.h - split/kernel_buffer_update.h - split/kernel_data_init.h - split/kernel_direct_lighting.h - split/kernel_do_volume.h - split/kernel_enqueue_inactive.h - split/kernel_holdout_emission_blurring_pathtermination_ao.h - split/kernel_indirect_background.h - split/kernel_indirect_subsurface.h - split/kernel_lamp_emission.h - split/kernel_next_iteration_setup.h - split/kernel_path_init.h - split/kernel_queue_enqueue.h - split/kernel_scene_intersect.h - split/kernel_shader_setup.h - split/kernel_shader_sort.h - split/kernel_shader_eval.h - split/kernel_shadow_blocked_ao.h - split/kernel_shadow_blocked_dl.h - split/kernel_split_common.h - split/kernel_split_data.h - split/kernel_split_data_types.h - split/kernel_subsurface_scatter.h -) - set(LIB ) @@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + set(cuda_sources device/cuda/kernel.cu ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) - set(cuda_filter_sources kernels/cuda/filter.cu - ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_FILTER_HEADERS} - ${SRC_UTIL_HEADERS} - ) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental) @@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() endif() - set(cuda_kernel_src "/kernels/cuda/${name}.cu") + set(cuda_kernel_src "/device/cuda/${name}.cu") set(cuda_flags ${flags} -D CCL_NAMESPACE_BEGIN= @@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES) -D NVCC -m ${CUDA_BITS} -I 
${CMAKE_CURRENT_SOURCE_DIR}/.. - -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda + -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda --use_fast_math -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}) @@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE) CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE) - if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) - # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE) - endif() - if(WITH_CYCLES_CUDA_BUILD_SERIAL) set(prev_arch ${arch}) endif() @@ -547,15 +461,15 @@ endif() # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) - macro(CYCLES_OPTIX_KERNEL_ADD name flags) - set(input "kernels/optix/kernel_optix.cu") + macro(CYCLES_OPTIX_KERNEL_ADD name input flags) set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx") set(cuda_flags ${flags} -I "${OPTIX_INCLUDE_DIR}" -I "${CMAKE_CURRENT_SOURCE_DIR}/.." 
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" + -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda" --use_fast_math + -Wno-deprecated-gpu-targets -o ${output}) if(WITH_NANOVDB) @@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) DEPENDS ${input} ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} COMMAND ${CUBIN_CC_ENV} @@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) DEPENDS ${input} ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} COMMAND @@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) endmacro() - CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__") - CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions") + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix + "device/optix/kernel.cu" + "") + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix_shader_raytrace + "device/optix/kernel_shader_raytrace.cu" + "--keep-device-functions") add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) cycles_set_solution_folder(cycles_kernel_optix) @@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN) endif() endif() -set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") -set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") -set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() cycles_add_library(cycles_kernel "${LIB}" - ${SRC_CPU_KERNELS} - ${SRC_CUDA_KERNELS} - ${SRC_OPTIX_KERNELS} - ${SRC_OPENCL_KERNELS} + ${SRC_DEVICE_CPU} + ${SRC_DEVICE_CUDA} + ${SRC_DEVICE_OPTIX} ${SRC_HEADERS} - ${SRC_KERNELS_CPU_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} - ${SRC_KERNELS_OPENCL_HEADERS} + ${SRC_DEVICE_CPU_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} - ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} - ${SRC_SPLIT_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ) source_group("bvh" FILES ${SRC_BVH_HEADERS}) source_group("closure" FILES ${SRC_CLOSURE_HEADERS}) -source_group("filter" FILES ${SRC_FILTER_HEADERS}) source_group("geom" FILES ${SRC_GEOM_HEADERS}) +source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_HEADERS}) -source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS}) -source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS}) -source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS}) -source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS}) -source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS}) +source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS}) +source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS}) 
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS}) +source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS}) source_group("svm" FILES ${SRC_SVM_HEADERS}) if(WITH_CYCLES_CUDA) @@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) add_dependencies(cycles_kernel cycles_kernel_optix) endif() -# OpenCL kernel - -# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl) -# add_custom_command( -# OUTPUT ${KERNEL_PREPROCESSED} -# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED} -# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS}) -# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) -# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) +# Install kernel source for runtime compilation -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split) - if(WITH_NANOVDB) set(SRC_NANOVDB_HEADERS diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index acf29cf1baf..539e9fd05fb 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -25,6 +25,8 @@ * the code has been extended and modified to support more primitives and work * with CPU/CUDA/OpenCL. 
*/ +#pragma once + #ifdef __EMBREE__ # include "kernel/bvh/bvh_embree.h" #endif @@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray) return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f; } -ccl_device_intersect bool scene_intersect(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect(const KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) { - PROFILING_INIT(kg, PROFILING_INTERSECT); - #ifdef __KERNEL_OPTIX__ uint p0 = 0; uint p1 = 0; @@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } #ifdef __BVH_LOCAL__ -ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, uint *lcg_state, int max_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); - # ifdef __KERNEL_OPTIX__ uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF; uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF; @@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, float3 dir = ray->D; float3 idir = ray->D; Transform ob_itfm; - rtc_ray.tfar = bvh_instance_motion_push( - kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm); + rtc_ray.tfar = ray->t * + bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); /* bvh_instance_motion_push() returns the inverse transform but * it's not needed here. 
*/ (void)ob_itfm; @@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, #endif #ifdef __SHADOW_RECORD_ALL__ -ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg, const Ray *ray, Intersection *isect, uint visibility, uint max_hits, uint *num_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); - # ifdef __KERNEL_OPTIX__ uint p0 = ((uint64_t)isect) & 0xFFFFFFFF; uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF; @@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); ctx.isect_s = isect; ctx.max_hits = max_hits; - ctx.num_hits = 0; IntersectContext rtc_ctx(&ctx); RTCRay rtc_ray; kernel_embree_setup_ray(*ray, rtc_ray, visibility); rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); - if (ctx.num_hits > max_hits) { - return true; - } *num_hits = ctx.num_hits; - return rtc_ray.tfar == -INFINITY; + return ctx.opaque_hit; } # endif /* __EMBREE__ */ @@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, #endif /* __SHADOW_RECORD_ALL__ */ #ifdef __VOLUME__ -ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); - # ifdef __KERNEL_OPTIX__ uint p0 = 0; uint p1 = 0; @@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, #endif /* __VOLUME__ */ #ifdef __VOLUME_RECORD_ALL__ -ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, +ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint max_hits, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); 
- if (!scene_intersect_valid(ray)) { return false; } diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h index 4605c3ea51d..092d770dcac 100644 --- a/intern/cycles/kernel/bvh/bvh_embree.h +++ b/intern/cycles/kernel/bvh/bvh_embree.h @@ -14,14 +14,13 @@ * limitations under the License. */ +#pragma once + #include <embree3/rtcore_ray.h> #include <embree3/rtcore_scene.h> -// clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" -// clang-format on +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" #include "util/util_vector.h" @@ -36,25 +35,29 @@ struct CCLIntersectContext { RAY_VOLUME_ALL = 4, } RayType; - KernelGlobals *kg; + const KernelGlobals *kg; RayType type; /* for shadow rays */ Intersection *isect_s; int max_hits; int num_hits; + float max_t; + bool opaque_hit; /* for SSS Rays: */ LocalIntersection *local_isect; int local_object_id; uint *lcg_state; - CCLIntersectContext(KernelGlobals *kg_, RayType type_) + CCLIntersectContext(const KernelGlobals *kg_, RayType type_) { kg = kg_; type = type_; max_hits = 1; num_hits = 0; + max_t = FLT_MAX; + opaque_hit = false; isect_s = NULL; local_isect = NULL; local_object_id = -1; @@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray, rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID; } -ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, +ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect) @@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, isect->type = kernel_tex_fetch(__prim_type, isect->prim); } -ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg, +ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, diff 
--git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 4006c9c1632..90b9f410b29 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -36,7 +36,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, @@ -74,9 +74,9 @@ ccl_device_inline if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); + isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir); #endif object = local_object; } @@ -196,7 +196,7 @@ ccl_device_inline return false; } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 5367bdb633c..15cd0f22213 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -16,7 +16,7 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. 
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, +ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg, int node_addr, int child) { @@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k return space; } -ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg, const float3 P, const float3 idir, const float t, @@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, #endif } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg, +ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg, const float3 P, const float3 dir, const float t, @@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg return tnear <= tfar; } -ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, return mask; } -ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 2e94b1d7c37..0ae36fccf9b 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -36,7 +36,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint visibility, @@ -68,10 +68,10 @@ ccl_device_inline Transform 
ob_itfm; #endif - int num_hits_in_instance = 0; + float t_world_to_instance = 1.0f; *num_hits = 0; - isect_array->t = tmax; + Intersection *isect = isect_array; /* traversal loop */ do { @@ -147,13 +147,14 @@ ccl_device_inline switch (p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + hit = triangle_intersect( + kg, isect, P, dir, isect_t, visibility, object, prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr); break; } #endif @@ -163,8 +164,16 @@ ccl_device_inline case PRIMITIVE_CURVE_RIBBON: case PRIMITIVE_MOTION_CURVE_RIBBON: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - hit = curve_intersect( - kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type); + hit = curve_intersect(kg, + isect, + P, + dir, + isect_t, + visibility, + object, + prim_addr, + ray->time, + curve_type); break; } #endif @@ -176,27 +185,49 @@ ccl_device_inline /* shadow ray early termination */ if (hit) { + /* Convert intersection distance to world space. */ + isect->t /= t_world_to_instance; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if * the primitive has a transparent shadow shader? */ - const int flags = intersection_get_shader_flags(kg, isect_array); + const int flags = intersection_get_shader_flags(kg, isect); - /* if no transparent shadows, all light is blocked */ - if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if (*num_hits == max_hits) { + if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) { + /* If no transparent shadows, all light is blocked and we can + * stop immediately. 
*/ return true; } - /* move on to next entry in intersections array */ - isect_array++; + /* Increase the number of hits, possibly beyond max_hits, we will + * simply not record those and only keep the max_hits closest. */ (*num_hits)++; - num_hits_in_instance++; - isect_array->t = isect_t; + if (*num_hits >= max_hits) { + /* If maximum number of hits reached, find the intersection with + * the largest distance to potentially replace when another hit + * is found. */ + const int num_recorded_hits = min(max_hits, *num_hits); + float max_recorded_t = isect_array[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; i++) { + if (isect_array[i].t > max_recorded_t) { + max_recorded_t = isect_array[i].t; + max_recorded_hit = i; + } + } + + isect = isect_array + max_recorded_hit; + + /* Limit the ray distance and stop counting hits beyond this. */ + isect_t = max_recorded_t * t_world_to_instance; + } + else { + /* Still have space for intersection, use next hit. */ + isect = isect + 1; + } } prim_addr++; @@ -207,13 +238,14 @@ ccl_device_inline object = kernel_tex_fetch(__prim_object, -prim_addr - 1); #if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + t_world_to_instance = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + /* Convert intersection to object space. */ + isect_t *= t_world_to_instance; ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); @@ -228,32 +260,19 @@ ccl_device_inline kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ - if (num_hits_in_instance) { - float t_fac; - #if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); #else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); #endif - /* scale isect->t to adjust for instancing */ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -#if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -#else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -#endif - } - - isect_t = tmax; - isect_array->t = isect_t; + /* Restore world space ray length. If max number of hits exceeded this + * distance is reduced to recorded only the closest hits. If not use + * the original ray length. */ + isect_t = (max_hits && *num_hits > max_hits) ? 
isect->t : tmax; object = OBJECT_NONE; + t_world_to_instance = 1.0f; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } @@ -262,7 +281,7 @@ ccl_device_inline return false; } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint visibility, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 89250a8d60a..a26d8c514f3 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -31,7 +31,7 @@ * BVH_MOTION: motion blur rendering */ -ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, +ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) @@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_TRIANGLE: { for (; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { + if (triangle_intersect( + kg, isect, P, dir, isect->t, visibility, object, prim_addr)) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; @@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for (; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if (motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { + kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; @@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for (; prim_addr < prim_addr2; prim_addr++) { const uint curve_type = 
kernel_tex_fetch(__prim_type, prim_addr); kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - const bool hit = curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); + const bool hit = curve_intersect(kg, + isect, + P, + dir, + isect->t, + visibility, + object, + prim_addr, + ray->time, + curve_type); if (hit) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) @@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr - 1); #if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif ++stack_ptr; @@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, return (isect->prim != PRIM_NONE); } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index 98e6ec25d15..6039e707fc3 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __BVH_TYPES__ -#define __BVH_TYPES__ +#pragma once CCL_NAMESPACE_BEGIN @@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) CCL_NAMESPACE_END - -#endif /* __BVH_TYPES__ */ diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h index b1faebce957..21384457b16 100644 --- a/intern/cycles/kernel/bvh/bvh_util.h +++ b/intern/cycles/kernel/bvh/bvh_util.h @@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } -/* This function should be used to compute a modified ray start position for - * rays leaving from a surface. The algorithm slightly distorts flat surface - * of a triangle. Surface is lifted by amount h along normal n in the incident - * point. */ - -ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng) -{ - float3 V[3], N[3]; - triangle_vertices_and_normals(kg, sd->prim, V, N); - - const float u = sd->u, v = sd->v; - const float w = 1 - u - v; - float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */ - float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */ - - object_normal_transform(kg, sd, &n); /* Normal x scale, world space */ - - /* Parabolic approximation */ - float a = dot(N[2] - N[0], V[0] - V[2]); - float b = dot(N[2] - N[1], V[1] - V[2]); - float c = dot(N[1] - N[0], V[1] - V[0]); - float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1); - - /* Check flipped normals */ - if (dot(n, Ng) > 0) { - /* Local linear envelope */ - float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f); - float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f); - float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f); - h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f); - h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f); - h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f); - h = max(min(min(h0, h1), h2), h * 0.5f); - } - else { - float h0 = 
max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f); - float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f); - float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f); - h0 = max(dot(P - V[0], N[0]) + h0, 0.0f); - h1 = max(dot(P - V[1], N[1]) + h1, 0.0f); - h2 = max(dot(P - V[2], N[2]) + h2, 0.0f); - h = min(-min(min(h0, h1), h2), h * 0.5f); - } - - return n * h; -} - -/* Ray offset to avoid shadow terminator artifact. */ - -ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L) -{ - float NL = dot(sd->N, L); - bool transmit = (NL < 0.0f); - float3 Ng = (transmit ? -sd->Ng : sd->Ng); - float3 P = ray_offset(sd->P, Ng); - - if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) { - const float offset_cutoff = - kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset; - /* Do ray offset (heavy stuff) only for close to be terminated triangles: - * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also - * make a smooth transition near the threshold. */ - if (offset_cutoff > 0.0f) { - float NgL = dot(Ng, L); - float offset_amount = 0.0f; - if (NL < offset_cutoff) { - offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f); - } - else { - offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f); - } - if (offset_amount > 0.0f) { - P += smooth_surface_offset(kg, sd, Ng) * offset_amount; - } - } - } - - return P; -} - #if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__)) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) @@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits) } #endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ -/* Utility to quickly get a shader flags from an intersection. */ +/* Utility to quickly get flags from an intersection. 
*/ -ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg, - const Intersection *isect) +ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect) { const int prim = kernel_tex_fetch(__prim_index, isect->prim); int shader = 0; @@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; } -ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg, - const Intersection *isect) +ccl_device_forceinline int intersection_get_shader_from_isect_prim( + const KernelGlobals *ccl_restrict kg, const int isect_prim) { - const int prim = kernel_tex_fetch(__prim_index, isect->prim); + const int prim = kernel_tex_fetch(__prim_index, isect_prim); int shader = 0; #ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE) #endif { shader = kernel_tex_fetch(__tri_shader, prim); @@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k return shader & SHADER_MASK; } -ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg, +ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect) +{ + return intersection_get_shader_from_isect_prim(kg, isect->prim); +} + +ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg, const Intersection *ccl_restrict isect) { if (isect->object != OBJECT_NONE) { @@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k return kernel_tex_fetch(__prim_object, isect->prim); } +ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg, + const Intersection 
*ccl_restrict isect) +{ + const int object = intersection_get_object(kg, isect); + + return kernel_tex_fetch(__object_flag, object); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 1f2ea47269b..0411d9c522d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -35,7 +35,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) @@ -147,7 +147,7 @@ ccl_device_inline if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr); } break; } @@ -165,7 +165,7 @@ ccl_device_inline continue; } motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr); + kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr); } break; } @@ -181,10 +181,9 @@ ccl_device_inline int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif ++stack_ptr; @@ -222,7 +221,7 @@ ccl_device_inline return (isect->prim != PRIM_NONE); } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 
a8664cc4331..4874270f15d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -35,7 +35,7 @@ ccl_device #else ccl_device_inline #endif - uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, @@ -150,7 +150,8 @@ ccl_device_inline if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + hit = triangle_intersect( + kg, isect_array, P, dir, isect_t, visibility, object, prim_addr); if (hit) { /* Move on to next entry in intersections array. */ isect_array++; @@ -190,7 +191,7 @@ ccl_device_inline continue; } hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr); if (hit) { /* Move on to next entry in intersections array. 
*/ isect_array++; @@ -228,10 +229,9 @@ ccl_device_inline int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif num_hits_in_instance = 0; @@ -289,7 +289,7 @@ ccl_device_inline return num_hits; } -ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index 99a5a675976..72a8c2ba090 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 6f2f2ebb202..4eb8bcae997 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + // clang-format off #include "kernel/closure/bsdf_ashikhmin_velvet.h" #include "kernel/closure/bsdf_diffuse.h" @@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl return val; } -ccl_device_inline int bsdf_sample(KernelGlobals *kg, +ccl_device_inline int bsdf_sample(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, @@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, - sd->I, - sd->dI.dx, - sd->dI.dy, - randu, - randv, - eval, - omega_in, - &domega_in->dx, - &domega_in->dy, - pdf); - break; -#endif default: label = LABEL_NONE; break; @@ -482,15 +469,16 @@ ccl_device ccl_device_inline #endif float3 - bsdf_eval(KernelGlobals *kg, + bsdf_eval(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, + const bool is_transmission, float *pdf) { - float3 eval; + float3 eval = zero_float3(); - if (dot(sd->N, omega_in) >= 0.0f) { + if (!is_transmission) { switch (sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: @@ -570,13 +558,7 @@ ccl_device_inline break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); - break; -#endif default: - eval = make_float3(0.0f, 0.0f, 0.0f); break; } if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { @@ -663,13 +645,7 @@ ccl_device_inline break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); - break; -#endif default: - eval = make_float3(0.0f, 0.0f, 0.0f); break; } if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { @@ -682,7 +658,7 @@ ccl_device_inline return eval; } -ccl_device void bsdf_blur(KernelGlobals 
*kg, ShaderClosure *sc, float roughness) +ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness) { /* ToDo: do we want to blur volume closures? */ #ifdef __SVM__ @@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #endif } -ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) -{ -#ifdef __SVM__ - switch (a->type) { - case CLOSURE_BSDF_TRANSPARENT_ID: - return true; - case CLOSURE_BSDF_DIFFUSE_ID: - case CLOSURE_BSDF_BSSRDF_ID: - case CLOSURE_BSDF_TRANSLUCENT_ID: - return bsdf_diffuse_merge(a, b); - case CLOSURE_BSDF_OREN_NAYAR_ID: - return bsdf_oren_nayar_merge(a, b); - case CLOSURE_BSDF_REFLECTION_ID: - case CLOSURE_BSDF_REFRACTION_ID: - case CLOSURE_BSDF_MICROFACET_GGX_ID: - case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: - case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: - return bsdf_microfacet_merge(a, b); - case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - return bsdf_ashikhmin_velvet_merge(a, b); - case CLOSURE_BSDF_DIFFUSE_TOON_ID: - case CLOSURE_BSDF_GLOSSY_TOON_ID: - return bsdf_toon_merge(a, b); - case CLOSURE_BSDF_HAIR_REFLECTION_ID: - case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - return bsdf_hair_merge(a, b); -# ifdef __PRINCIPLED__ - case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: - case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: - return bsdf_principled_diffuse_merge(a, b); -# endif -# ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - return volume_henyey_greenstein_merge(a, b); -# endif - default: - return false; - } -#else - return false; -#endif -} - CCL_NAMESPACE_END diff --git 
a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 9814a7cf5c9..be6383e521a 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -14,20 +14,19 @@ * limitations under the License. */ -#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__ -#define __BSDF_ASHIKHMIN_SHIRLEY_H__ - /* -ASHIKHMIN SHIRLEY BSDF - -Implementation of -Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) - -The Fresnel factor is missing to get a separable bsdf (intensity*color), as is -the case with all other microfacet-based BSDF implementations in Cycles. + * ASHIKHMIN SHIRLEY BSDF + * + * Implementation of + * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) + * + * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is + * the case with all other microfacet-based BSDF implementations in Cycles. + * + * Other than that, the implementation directly follows the paper. + */ -Other than that, the implementation directly follows the paper. -*/ +#pragma once CCL_NAMESPACE_BEGIN @@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 3d3f20edab3..f51027f5701 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -30,8 +30,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_ASHIKHMIN_VELVET_H__ -#define __BSDF_ASHIKHMIN_VELVET_H__ +#pragma once + +#include "kernel/kernel_montecarlo.h" CCL_NAMESPACE_BEGIN @@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a; - const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma); -} - ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index ea604ed0311..1555aa30304 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_DIFFUSE_H__ -#define __BSDF_DIFFUSE_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a; - const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)); -} - ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index aa62c1c7ceb..b06dd196b9e 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_DIFFUSE_RAMP_H__ -#define __BSDF_DIFFUSE_RAMP_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, #endif /* __OSL__ */ CCL_NAMESPACE_END - -#endif /* __BSDF_DIFFUSE_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index 7ca9424b815..f56f78aa1f0 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_HAIR_H__ -#define __BSDF_HAIR_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const HairBsdf *bsdf_a = (const HairBsdf *)a; - const HairBsdf *bsdf_b = (const HairBsdf *)b; - - return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) && - (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset); -} - ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_HAIR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h index f12661b3095..bfe56e5ab0e 100644 --- a/intern/cycles/kernel/closure/bsdf_hair_principled.h +++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h @@ -14,15 +14,14 @@ * limitations under the License. */ +#pragma once + #ifdef __KERNEL_CPU__ # include <fenv.h> #endif #include "kernel/kernel_color.h" -#ifndef __BSDF_HAIR_PRINCIPLED_H__ -# define __BSDF_HAIR_PRINCIPLED_H__ - CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledHairExtra { @@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering( } /* Combine the three values using their luminances. */ -ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c) +ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c) { return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c)); } -# ifdef __HAIR__ +#ifdef __HAIR__ /* Set up the hair closure. 
*/ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf) { @@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG; } -# endif /* __HAIR__ */ +#endif /* __HAIR__ */ /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */ -ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap) +ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap) { /* Primary specular (R). */ Ap[0] = make_float4(f, f, f, f); @@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i, } /* Evaluation function for our shader. */ -ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg, +ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, @@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg, } /* Sampling function for the hair shader. */ -ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg, +ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg, const ShaderClosure *sc, ShaderData *sd, float randu, @@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg, *omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i); -# ifdef __RAY_DIFFERENTIALS__ +#ifdef __RAY_DIFFERENTIALS__ float3 N = safe_normalize(sd->I + *omega_in); *domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx; *domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy; -# endif +#endif return LABEL_GLOSSY | ((p == 0) ? 
LABEL_REFLECT : LABEL_TRANSMIT); } @@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale( return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f; } -ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc) +ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc) { PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc; return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v)); @@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo } CCL_NAMESPACE_END - -#endif /* __BSDF_HAIR_PRINCIPLED_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index af03bab39f7..227cb448b47 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -30,8 +30,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_MICROFACET_H__ -#define __BSDF_MICROFACET_H__ +#pragma once + +#include "kernel/kernel_lookup_table.h" +#include "kernel/kernel_random.h" CCL_NAMESPACE_BEGIN @@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i /* Beckmann and GGX microfacet importance sampling. 
*/ -ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg, +ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg, const float cos_theta_i, const float sin_theta_i, float randu, @@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i, *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x)); } -ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg, +ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg, const float3 omega_i, const float alpha_x, const float alpha_y, @@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a; - const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) && - (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) && - (bsdf_a->ior == bsdf_b->ior) && - ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) || - ((bsdf_a->extra && bsdf_b->extra) && - (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) && - (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) && - (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat))); -} - ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { bsdf->extra = NULL; @@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc return make_float3(out, 
out, out); } -ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, } CCL_NAMESPACE_END - -#endif /* __BSDF_MICROFACET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 9795c8da065..68d5071dbce 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Most of the code is based on the supplemental implementations from @@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc bsdf->extra->cspec0); } -ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu bsdf->extra->cspec0); } -ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index 41e5736bf49..be12d47f0ea 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __BSDF_OREN_NAYAR_H__ -#define __BSDF_OREN_NAYAR_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a; - const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness); -} - ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_OREN_NAYAR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index cf5484383f2..43f8cf71c59 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_PHONG_RAMP_H__ -#define __BSDF_PHONG_RAMP_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, #endif /* __OSL__ */ CCL_NAMESPACE_END - -#endif /* __BSDF_PHONG_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index d5d012068ff..a72af519482 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ -#define __BSDF_PRINCIPLED_DIFFUSE_H__ +#pragma once /* DISNEY PRINCIPLED DIFFUSE BRDF * * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) */ +#include "kernel/closure/bsdf_util.h" + CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledDiffuseBsdf { @@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a; - const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); -} - ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h index 3707de29d73..60ce7e4eb75 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#ifndef __BSDF_PRINCIPLED_SHEEN_H__ -#define __BSDF_PRINCIPLED_SHEEN_H__ +#pragma once /* DISNEY PRINCIPLED SHEEN BRDF * * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) */ +#include "kernel/closure/bsdf_util.h" + CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledSheenBsdf { @@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h index c24ba170915..31283971d5a 100644 --- a/intern/cycles/kernel/closure/bsdf_reflection.h +++ b/intern/cycles/kernel/closure/bsdf_reflection.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_REFLECTION_H__ -#define __BSDF_REFLECTION_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_REFLECTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h index d4fbe86dac0..cfedb5dfe2c 100644 --- a/intern/cycles/kernel/closure/bsdf_refraction.h +++ b/intern/cycles/kernel/closure/bsdf_refraction.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_REFRACTION_H__ -#define __BSDF_REFRACTION_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_REFRACTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index cc5de21ed0e..acdafe0f735 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_TOON_H__ -#define __BSDF_TOON_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const ToonBsdf *bsdf_a = (const ToonBsdf *)a; - const ToonBsdf *bsdf_b = (const ToonBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) && - (bsdf_a->smooth == bsdf_b->smooth); -} - ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle) { float is; @@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_TOON_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h index 4e5513499e8..f1dc7efb345 100644 --- a/intern/cycles/kernel/closure/bsdf_transparent.h +++ b/intern/cycles/kernel/closure/bsdf_transparent.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_TRANSPARENT_H__ -#define __BSDF_TRANSPARENT_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_TRANSPARENT_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index a73dee1b045..beec5f768a1 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_UTIL_H__ -#define __BSDF_UTIL_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0 } CCL_NAMESPACE_END - -#endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index 562daf1286d..0f9278bba89 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_BSSRDF_H__ -#define __KERNEL_BSSRDF_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf { float3 radius; float3 albedo; - float sharpness; - float texture_blur; float roughness; - float channels; + float anisotropy; } Bssrdf; static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!"); -/* Planar Truncated Gaussian - * - * Note how this is different from the typical gaussian, this one integrates - * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky - * that integrating x*exp(-x) gives a nice closed form solution. 
*/ - -/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */ -#define GAUSS_TRUNCATE 12.46f - -ccl_device float bssrdf_gaussian_eval(const float radius, float r) -{ - /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm - * = 1 - exp(-Rm*Rm/(2*v)) */ - const float v = radius * radius * (0.25f * 0.25f); - const float Rm = sqrtf(v * GAUSS_TRUNCATE); - - if (r >= Rm) - return 0.0f; - - return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v); -} - -ccl_device float bssrdf_gaussian_pdf(const float radius, float r) +ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA) { - /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ - const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE); - - return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated)); + float s = sqrtf(3.0f * (1.0f - alpha_prime)); + return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s); } -ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h) +ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA) { - /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v)) - * r = sqrt(-2*v*logf(xi)) */ - const float v = radius * radius * (0.25f * 0.25f); - const float Rm = sqrtf(v * GAUSS_TRUNCATE); - - /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ - const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE); - - /* r(xi) */ - const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated); - *r = sqrtf(r_squared); - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_squared); -} - -/* Planar Cubic BSSRDF falloff - * - * This is basically (Rm - x)^3, with some factors to normalize it. For sampling - * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as - * far as I can tell has no closed form solution. So we get an iterative solution - * instead with newton-raphson. 
*/ - -ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r) -{ - if (sharpness == 0.0f) { - const float Rm = radius; - - if (r >= Rm) - return 0.0f; - - /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */ - const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm; - const float f = Rm - r; - const float num = f * f * f; - - return (10.0f * num) / (Rm5 * M_PI_F); + /* Little Newton solver. */ + if (rd < 1e-4f) { + return 0.0f; + } + if (rd >= 0.995f) { + return 0.999999f; } - else { - float Rm = radius * (1.0f + sharpness); - - if (r >= Rm) - return 0.0f; - /* custom variation with extra sharpness, to match the previous code */ - const float y = 1.0f / (1.0f + sharpness); - float Rmy, ry, ryinv; + float x0 = 0.0f; + float x1 = 1.0f; + float xmid, fmid; - if (sharpness == 1.0f) { - Rmy = sqrtf(Rm); - ry = sqrtf(r); - ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f; + constexpr const int max_num_iterations = 12; + for (int i = 0; i < max_num_iterations; ++i) { + xmid = 0.5f * (x0 + x1); + fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA); + if (fmid < rd) { + x0 = xmid; } else { - Rmy = powf(Rm, y); - ry = powf(r, y); - ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f; + x1 = xmid; } - - const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy; - const float f = Rmy - ry; - const float num = f * (f * f) * (y * ryinv); - - return (10.0f * num) / (Rmy5 * M_PI_F); - } -} - -ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r) -{ - return bssrdf_cubic_eval(radius, sharpness, r); -} - -/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi) -{ - /* newton-raphson iteration, usually succeeds in 2-4 iterations, except - * outside 0.02 ... 
0.98 where it can go up to 10, so overall performance - * should not be too bad */ - const float tolerance = 1e-6f; - const int max_iteration_count = 10; - float x = 0.25f; - int i; - - for (i = 0; i < max_iteration_count; i++) { - float x2 = x * x; - float x3 = x2 * x; - float nx = (1.0f - x); - - float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi; - float f_ = 20.0f * (x * nx) * (nx * nx); - - if (fabsf(f) < tolerance || f_ == 0.0f) - break; - - x = saturate(x - f / f_); } - return x; + return xmid; } -ccl_device void bssrdf_cubic_sample( - const float radius, const float sharpness, float xi, float *r, float *h) +ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta) { - float Rm = radius; - float r_ = bssrdf_cubic_quintic_root_find(xi); - - if (sharpness != 0.0f) { - r_ = powf(r_, 1.0f + sharpness); - Rm *= (1.0f + sharpness); - } - - r_ *= Rm; - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* Approximate Reflectance Profiles - * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf - */ - -/* This is a bit arbitrary, just need big enough radius so it matches - * the mean free length, but still not too big so sampling is still - * effective. Might need some further tweaks. - */ -#define BURLEY_TRUNCATE 16.0f -#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE) - -ccl_device_inline float bssrdf_burley_fitting(float A) -{ - /* Diffuse surface transmission, equation (6). */ - return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f); -} - -/* Scale mean free path length so it gives similar looking result - * to Cubic and Gaussian models. - */ -ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r) -{ - return 0.25f * M_1_PI_F * r; -} - -ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf) -{ - /* Mean free path length. */ - const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius); - /* Surface albedo. 
*/ - const float3 A = bssrdf->albedo; - const float3 s = make_float3( - bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z)); - - bssrdf->radius = l / s; -} - -ccl_device float bssrdf_burley_eval(const float d, float r) -{ - const float Rm = BURLEY_TRUNCATE * d; - - if (r >= Rm) - return 0.0f; - - /* Burley reflectance profile, equation (3). - * - * NOTES: - * - Surface albedo is already included into sc->weight, no need to - * multiply by this term here. - * - This is normalized diffuse model, so the equation is multiplied - * by 2*pi, which also matches cdf(). - */ - float exp_r_3_d = expf(-r / (3.0f * d)); - float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d; - return (exp_r_d + exp_r_3_d) / (4.0f * d); -} - -ccl_device float bssrdf_burley_pdf(const float d, float r) -{ - return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF); -} - -/* Find the radius for desired CDF value. - * Returns scaled radius, meaning the result is to be scaled up by d. - * Since there's no closed form solution we do Newton-Raphson method to find it. - */ -ccl_device_forceinline float bssrdf_burley_root_find(float xi) -{ - const float tolerance = 1e-6f; - const int max_iteration_count = 10; - /* Do initial guess based on manual curve fitting, this allows us to reduce - * number of iterations to maximum 4 across the [0..1] range. We keep maximum - * number of iteration higher just to be sure we didn't miss root in some - * corner case. - */ - float r; - if (xi <= 0.9f) { - r = expf(xi * xi * 2.4f) - 1.0f; + if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) { + /* Scale mean free path length so it gives similar looking result to older + * Cubic, Gaussian and Burley models. */ + bssrdf->radius *= 0.25f * M_1_PI_F; } else { - /* TODO(sergey): Some nicer curve fit is possible here. */ - r = 15.0f; - } - /* Solve against scaled radius. 
*/ - for (int i = 0; i < max_iteration_count; i++) { - float exp_r_3 = expf(-r / 3.0f); - float exp_r = exp_r_3 * exp_r_3 * exp_r_3; - float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi; - float f_ = 0.25f * exp_r + 0.25f * exp_r_3; + /* Adjust radius based on IOR and albedo. */ + const float inv_eta = 1.0f / eta; + const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta; + const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) / + (1.0f - F_dr); /* From Jensen's Fdr ratio formula. */ - if (fabsf(f) < tolerance || f_ == 0.0f) { - break; - } + const float3 alpha_prime = make_float3( + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA), + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA), + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA)); - r = r - f / f_; - if (r < 0.0f) { - r = 0.0f; - } + bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime)); } - return r; } -ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h) -{ - const float Rm = BURLEY_TRUNCATE * d; - const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d; - - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* None BSSRDF falloff - * - * Samples distributed over disk with no falloff, for reference. */ - -ccl_device float bssrdf_none_eval(const float radius, float r) -{ - const float Rm = radius; - return (r < Rm) ? 
1.0f : 0.0f; -} - -ccl_device float bssrdf_none_pdf(const float radius, float r) -{ - /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */ - const float Rm = radius; - const float area = (M_PI_F * Rm * Rm); - - return bssrdf_none_eval(radius, r) / area; -} - -ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h) -{ - /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2 - * r = sqrt(xi)*Rm */ - const float Rm = radius; - const float r_ = sqrtf(xi) * Rm; - - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* Generic */ +/* Setup */ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) { @@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL; } -ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) +ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior) { int flag = 0; int bssrdf_channels = 3; @@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) if (bssrdf_channels < 3) { /* Add diffuse BSDF if any radius too small. */ #ifdef __PRINCIPLED__ - if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { + if (bssrdf->roughness != FLT_MAX) { float roughness = bssrdf->roughness; float3 N = bssrdf->N; @@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) /* Setup BSSRDF if radius is large enough. 
*/ if (bssrdf_channels > 0) { bssrdf->type = type; - bssrdf->channels = bssrdf_channels; - bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels; - bssrdf->texture_blur = saturate(bssrdf->texture_blur); - bssrdf->sharpness = saturate(bssrdf->sharpness); + bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels; - if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID || - type == CLOSURE_BSSRDF_RANDOM_WALK_ID || - type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { - bssrdf_burley_setup(bssrdf); - } + bssrdf_setup_radius(bssrdf, type, ior); flag |= SD_BSSRDF; } @@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) return flag; } -ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - float radius; - - /* Sample color channel and reuse random number. Only a subset of channels - * may be used if their radius was too small to handle as BSSRDF. */ - xi *= bssrdf->channels; - - if (xi < 1.0f) { - radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x : - (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y : - bssrdf->radius.z; - } - else if (xi < 2.0f) { - xi -= 1.0f; - radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y : - bssrdf->radius.z; - } - else { - xi -= 2.0f; - radius = bssrdf->radius.z; - } - - /* Sample BSSRDF. 
*/ - if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { - bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h); - } - else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) { - bssrdf_gaussian_sample(radius, xi, r, h); - } - else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || - * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */ - bssrdf_burley_sample(radius, xi, r, h); - } -} - -ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r) -{ - if (radius == 0.0f) { - return 0.0f; - } - else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { - return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r); - } - else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) { - return bssrdf_gaussian_pdf(radius, r); - } - else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || - * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ - return bssrdf_burley_pdf(radius, r); - } -} - -ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - - return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r), - bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r), - bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r)); -} - -ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - float3 pdf = bssrdf_eval(sc, r); - - return (pdf.x + pdf.y + pdf.z) / bssrdf->channels; -} - CCL_NAMESPACE_END - -#endif /* __KERNEL_BSSRDF_H__ */ diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h index 911382e6865..a2519d97618 100644 --- a/intern/cycles/kernel/closure/emissive.h +++ b/intern/cycles/kernel/closure/emissive.h @@ -30,6 +30,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN /* BACKGROUND CLOSURE */ diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h index 1430f712701..69959a3f21b 100644 --- a/intern/cycles/kernel/closure/volume.h +++ b/intern/cycles/kernel/closure/volume.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __VOLUME_H__ -#define __VOLUME_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume) return SD_SCATTER; } -ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a; - const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b; - - return (volume_a->g == volume_b->g); -} - -ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, +ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc, const float3 I, float3 omega_in, float *pdf) { - const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc; - float g = volume->g; + float g = svc->g; /* note that I points towards the viewer */ if (fabsf(g) < 1e-3f) { @@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd return dir; } -ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, +ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc, float3 I, float3 dIdx, float3 dIdy, @@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 *domega_in_dy, float *pdf) { - const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc; - float g = volume->g; + float g = svc->g; /* note that I points towards the viewer and so is used negated */ *omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf); @@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const 
ShaderClosure *sc, /* VOLUME CLOSURE */ ccl_device float3 volume_phase_eval(const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *svc, float3 omega_in, float *pdf) { - kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID); - - return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf); } ccl_device int volume_phase_sample(const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *svc, float randu, float randv, float3 *eval, @@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd, differential3 *domega_in, float *pdf) { - int label; - - switch (sc->type) { - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, - sd->I, - sd->dI.dx, - sd->dI.dy, - randu, - randv, - eval, - omega_in, - &domega_in->dx, - &domega_in->dy, - pdf); - break; - default: - *eval = make_float3(0.0f, 0.0f, 0.0f); - label = LABEL_NONE; - break; + return volume_henyey_greenstein_sample(svc, + sd->I, + sd->dI.dx, + sd->dI.dy, + randu, + randv, + eval, + omega_in, + &domega_in->dx, + &domega_in->dy, + pdf); +} + +/* Volume sampling utilities. */ + +/* todo: this value could be tweaked or turned into a probability to avoid + * unnecessary work in volumes and subsurface scattering. */ +#define VOLUME_THROUGHPUT_EPSILON 1e-6f + +ccl_device float3 volume_color_transmittance(float3 sigma, float t) +{ + return exp3(-sigma * t); +} + +ccl_device float volume_channel_get(float3 value, int channel) +{ + return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z); +} + +ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf) +{ + /* Sample color channel proportional to throughput and single scattering + * albedo, to significantly reduce noise with many bounce, following: + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". 
Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + float3 weights = fabs(throughput * albedo); + float sum_weights = weights.x + weights.y + weights.z; + float3 weights_pdf; + + if (sum_weights > 0.0f) { + weights_pdf = weights / sum_weights; } + else { + weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f); + } + + *pdf = weights_pdf; - return label; + /* OpenCL does not support -> on float3, so don't use pdf->x. */ + if (rand < weights_pdf.x) { + return 0; + } + else if (rand < weights_pdf.x + weights_pdf.y) { + return 1; + } + else { + return 2; + } } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h index 88f6a264a5a..bfd936c7bbd 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_COMPAT_CPU_H__ -#define __KERNEL_COMPAT_CPU_H__ +#pragma once #define __KERNEL_CPU__ @@ -27,14 +26,6 @@ # pragma GCC diagnostic ignored "-Wuninitialized" #endif -/* Selective nodes compilation. */ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - #include "util/util_half.h" #include "util/util_math.h" #include "util/util_simd.h" @@ -43,15 +34,6 @@ #define ccl_addr_space -#define ccl_local_id(d) 0 -#define ccl_global_id(d) (kg->global_id[d]) - -#define ccl_local_size(d) 1 -#define ccl_global_size(d) (kg->global_size[d]) - -#define ccl_group_id(d) ccl_global_id(d) -#define ccl_num_groups(d) ccl_global_size(d) - /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ @@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN * simple arrays and after inlining fetch hopefully revert to being a simple * pointer lookup. 
*/ template<typename T> struct texture { - ccl_always_inline const T &fetch(int index) + ccl_always_inline const T &fetch(int index) const { kernel_assert(index >= 0 && index < width); return data[index]; } -#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) - /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain - * compatibility with existing indices and data structures. - */ - ccl_always_inline avxf fetch_avxf(const int index) - { - kernel_assert(index >= 0 && (index + 1) < width); - ssef *ssef_data = (ssef *)data; - ssef *ssef_node_data = &ssef_data[index]; - return _mm256_loadu_ps((float *)ssef_node_data); - } -#endif - -#ifdef __KERNEL_SSE2__ - ccl_always_inline ssef fetch_ssef(int index) - { - kernel_assert(index >= 0 && index < width); - return ((ssef *)data)[index]; - } - - ccl_always_inline ssei fetch_ssei(int index) - { - kernel_assert(index >= 0 && index < width); - return ((ssei *)data)[index]; - } -#endif T *data; int width; @@ -110,15 +66,6 @@ template<typename T> struct texture { /* Macros to handle different memory storage on different devices */ -#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) -#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index)) -#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) -#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) -#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) -#define kernel_tex_array(tex) (kg->tex.data) - -#define kernel_data (kg->__data) - #ifdef __KERNEL_SSE2__ typedef vector3<sseb> sse3b; typedef vector3<ssef> sse3f; @@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f; #endif CCL_NAMESPACE_END - -#endif /* __KERNEL_COMPAT_CPU_H__ */ diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h new file mode 100644 index 00000000000..98b036e269d --- /dev/null +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in + * the kernel, to access constant data. These are all stored as "textures", but + * these are really just standard arrays. We can't use actually globals because + * multiple renders may be running inside the same process. */ + +#ifdef __OSL__ +struct OSLGlobals; +struct OSLThreadData; +struct OSLShadingSystem; +#endif + +typedef struct KernelGlobals { +#define KERNEL_TEX(type, name) texture<type> name; +#include "kernel/kernel_textures.h" + + KernelData __data; + +#ifdef __OSL__ + /* On the CPU, we also have the OSL globals here. Most data structures are shared + * with SVM, the difference is in the shaders and object/mesh attributes. 
*/ + OSLGlobals *osl; + OSLShadingSystem *osl_ss; + OSLThreadData *osl_tdata; +#endif + + /* **** Run-time data **** */ + + ProfilingState profiler; +} KernelGlobals; + +/* Abstraction macros */ +#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) +#define kernel_tex_array(tex) (kg->tex.data) +#define kernel_data (kg->__data) + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h index 59b96c86c50..57e81ab186d 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_CPU_IMAGE_H__ -#define __KERNEL_CPU_IMAGE_H__ +#pragma once #ifdef WITH_NANOVDB # define NANOVDB_USE_INTRINSICS @@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator { #undef SET_CUBIC_SPLINE_WEIGHTS -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, int id, float3 P, InterpolationType interp) @@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, } /* Namespace. 
*/ CCL_NAMESPACE_END - -#endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index 8040bfb7b33..ac1cdf5fffe 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -56,9 +56,9 @@ /* do nothing */ #endif -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index b907c6a2bac..ae2a841835a 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -14,50 +14,49 @@ * limitations under the License. */ -#ifndef __KERNEL_H__ -#define __KERNEL_H__ +#pragma once /* CPU Kernel Interface */ -#include "kernel/kernel_types.h" #include "util/util_types.h" +#include "kernel/kernel_types.h" + CCL_NAMESPACE_BEGIN #define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z #define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) +struct IntegratorStateCPU; struct KernelGlobals; struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); -void *kernel_osl_memory(KernelGlobals *kg); -bool kernel_osl_use(KernelGlobals *kg); +void *kernel_osl_memory(const KernelGlobals *kg); +bool kernel_osl_use(const KernelGlobals *kg); void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); #define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse3 -#include 
"kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" CCL_NAMESPACE_END - -#endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h new file mode 100644 index 00000000000..81f328c710b --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +/* -------------------------------------------------------------------- + * Integrator. 
+ */ + +#define KERNEL_INTEGRATOR_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state) + +#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + ccl_global float *render_buffer) + +#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) + +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); +KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); +KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); +KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume); +KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); + +#undef KERNEL_INTEGRATOR_FUNCTION +#undef KERNEL_INTEGRATOR_INIT_FUNCTION +#undef KERNEL_INTEGRATOR_SHADE_FUNCTION + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); + +/* -------------------------------------------------------------------- + * Adaptive sampling. 
+ */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride); + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride); +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride); + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index); + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h new file mode 100644 index 00000000000..1432abfd330 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -0,0 +1,235 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. + */ + +#pragma once + +// clang-format off +#include "kernel/device/cpu/compat.h" + +#ifndef KERNEL_STUB +# include "kernel/device/cpu/globals.h" +# include "kernel/device/cpu/image.h" + +# include "kernel/integrator/integrator_state.h" +# include "kernel/integrator/integrator_state_flow.h" +# include "kernel/integrator/integrator_state_util.h" + +# include "kernel/integrator/integrator_init_from_camera.h" +# include "kernel/integrator/integrator_init_from_bake.h" +# include "kernel/integrator/integrator_intersect_closest.h" +# include "kernel/integrator/integrator_intersect_shadow.h" +# include "kernel/integrator/integrator_intersect_subsurface.h" +# include "kernel/integrator/integrator_intersect_volume_stack.h" +# include "kernel/integrator/integrator_shade_background.h" +# include "kernel/integrator/integrator_shade_light.h" +# include "kernel/integrator/integrator_shade_shadow.h" +# include "kernel/integrator/integrator_shade_surface.h" +# include "kernel/integrator/integrator_shade_volume.h" +# include "kernel/integrator/integrator_megakernel.h" + +# include "kernel/kernel_film.h" +# include "kernel/kernel_adaptive_sampling.h" +# include "kernel/kernel_bake.h" +# include "kernel/kernel_id_passes.h" + +#else +# define STUB_ASSERT(arch, name) \ + assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif /* KERNEL_STUB */ +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Integrator. + */ + +#ifdef KERNEL_STUB +# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0) +#else +# define KERNEL_INVOKE(name, ...) 
integrator_##name(__VA_ARGS__) +#endif + +#define DEFINE_INTEGRATOR_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state) \ + { \ + KERNEL_INVOKE(name, kg, state); \ + } + +#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, state, render_buffer); \ + } + +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. */ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) +DEFINE_INTEGRATOR_KERNEL(intersect_closest) +DEFINE_INTEGRATOR_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) +DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) +DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) + +/* -------------------------------------------------------------------- + * Shader evaluation. 
+ */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_displace); +#else + kernel_displace_evaluate(kg, input, output, offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_background); +#else + kernel_background_evaluate(kg, input, output, offset); +#endif +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); + return false; +#else + return kernel_adaptive_sampling_convergence_check( + kg, render_buffer, x, y, threshold, reset, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); +#else + kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); +#else + kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); +#endif +} + +/* -------------------------------------------------------------------- + 
* Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); +#else + kernel_cryptomatte_post(kg, render_buffer, pixel_index); +#endif +} + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) +{ +#if 0 +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, bake); +# else +# ifdef __BAKING__ + kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); +# endif +# endif /* KERNEL_STUB */ +#endif +} + +#undef KERNEL_INVOKE +#undef DEFINE_INTEGRATOR_KERNEL +#undef DEFINE_INTEGRATOR_SHADE_KERNEL +#undef DEFINE_INTEGRATOR_INIT_KERNEL + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp index 5f6b6800363..220768036ab 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -34,6 +34,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp index 97e8fc25140..90c05113cbe 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -35,6 +35,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_avx2 -#include 
"kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp index 26d7fd4de48..fb85ef5b0d0 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -29,6 +29,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp index 3f259aa4480..87baf04258a 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp @@ -31,6 +31,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp index 68bae8c07c6..bb421d58815 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp @@ -32,6 +32,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h index ea3b78b7cef..665da43e1a1 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -14,20 +14,15 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COMPAT_CUDA_H__ -#define __KERNEL_COMPAT_CUDA_H__ +#pragma once #define __KERNEL_GPU__ #define __KERNEL_CUDA__ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END -/* Selective nodes compilation. */ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH #endif /* Manual definitions so we can compile without CUDA toolkit. */ @@ -38,8 +33,6 @@ typedef unsigned long long uint64_t; #else # include <stdint.h> #endif -typedef unsigned short half; -typedef unsigned long long CUtexObject; #ifdef CYCLES_CUBIN_CC # define FLT_MIN 1.175494350822287507969e-38f @@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject; # define FLT_EPSILON 1.192092896e-07F #endif -__device__ half __float2half(const float f) -{ - half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); - return val; -} - -/* Qualifier wrappers for different names on different devices */ +/* Qualifiers */ #define ccl_device __device__ __inline__ #if __CUDA_ARCH__ < 500 @@ -68,104 +54,61 @@ __device__ half __float2half(const float f) #define ccl_device_noinline_cpu ccl_device #define ccl_global #define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ #define ccl_constant const -#define ccl_local __shared__ -#define ccl_local_param +#define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll -/* TODO(sergey): In theory we might use references with CUDA, however - * performance impact yet to be investigated. 
- */ -#define ccl_ref #define ccl_align(n) __align__(n) #define ccl_optional_struct_init -#define ATTR_FALLTHROUGH - -#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH) - /* No assert supported for CUDA */ #define kernel_assert(cond) -/* Types */ +/* GPU thread, block, grid size and index */ -#include "util/util_half.h" -#include "util/util_types.h" +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) -/* Work item functions */ +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) -ccl_device_inline uint ccl_local_id(uint d) -{ - switch (d) { - case 0: - return threadIdx.x; - case 1: - return threadIdx.y; - case 2: - return threadIdx.z; - default: - return 0; - } -} +/* GPU warp synchronizaton */ -#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) +#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) +#define ccl_gpu_popc(x) __popc(x) -ccl_device_inline uint ccl_local_size(uint d) -{ - switch (d) { - case 0: - return blockDim.x; - case 1: - return blockDim.y; - case 2: - return blockDim.z; - default: - return 0; - } -} +/* GPU texture objects */ -#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) +typedef unsigned long long CUtexObject; +typedef CUtexObject ccl_gpu_tex_object; -ccl_device_inline uint ccl_group_id(uint d) +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) { - switch (d) { - case 0: - return blockIdx.x; - case 1: - return blockIdx.y; - case 2: - return blockIdx.z; - 
default: - return 0; - } + return tex2D<T>(texobj, x, y); } -ccl_device_inline uint ccl_num_groups(uint d) +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) { - switch (d) { - case 0: - return gridDim.x; - case 1: - return gridDim.y; - case 2: - return gridDim.z; - default: - return 0; - } + return tex3D<T>(texobj, x, y, z); } -/* Textures */ - -/* Use arrays for regular data. */ -#define kernel_tex_fetch(t, index) t[(index)] -#define kernel_tex_array(t) (t) - -#define kernel_data __data - /* Use fast math functions */ #define cosf(x) __cosf(((float)(x))) @@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d) #define logf(x) __logf(((float)(x))) #define expf(x) __expf(((float)(x))) -#endif /* __KERNEL_COMPAT_CUDA_H__ */ +/* Half */ + +typedef unsigned short half; + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h new file mode 100644 index 00000000000..46196dcdb51 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/config.h @@ -0,0 +1,114 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Device data taken from CUDA occupancy calculator. + * + * Terminology + * - CUDA GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* 3.0 and 3.5 */ +#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 5.x, 6.x */ +#elif __CUDA_ARCH__ <= 699 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of + * registers */ +# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 +# define GPU_KERNEL_MAX_REGISTERS 64 +# else +# define GPU_KERNEL_MAX_REGISTERS 48 +# endif + +/* 7.x, 8.x */ +#elif __CUDA_ARCH__ <= 899 
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 512 +# define GPU_KERNEL_MAX_REGISTERS 96 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h new file mode 100644 index 00000000000..169047175f5 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/device/cuda/kernel.cu index 84938b889e5..e26fe243642 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/device/cuda/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2013 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,15 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_indirect_subsurface.h" +/* CUDA kernel entry points */ -#define KERNEL_NAME indirect_subsurface -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#ifdef __CUDA_ARCH__ +# include "kernel/device/cuda/compat.h" +# include "kernel/device/cuda/config.h" +# include "kernel/device/cuda/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h index 132653fa7ca..b015c78a8f5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -14,6 +14,10 @@ * limitations under the License. */ +#pragma once + +CCL_NAMESPACE_BEGIN + #ifdef WITH_NANOVDB # define NDEBUG /* Disable "assert" in device code */ # define NANOVDB_USE_INTRINSICS @@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. 
*/ template<typename T> -ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) { - CUtexObject tex = (CUtexObject)info.data; + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; x = (x * info.width) - 0.5f; y = (y * info.height) - 0.5f; @@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; - return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) + - cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1)); + return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) + + cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1)); } /* Fast tricubic texture lookup using 8 trilinear lookups. 
*/ template<typename T> -ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +ccl_device_noinline T +kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) { - CUtexObject tex = (CUtexObject)info.data; + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; x = (x * info.width) - 0.5f; y = (y * info.height) - 0.5f; @@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth; float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth; - return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) + - g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1))); + return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1))); } #ifdef WITH_NANOVDB @@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl } template<typename T> -ccl_device_inline T kernel_tex_image_interp_nanovdb( +ccl_device_noinline T kernel_tex_image_interp_nanovdb( const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +ccl_device float4 
kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl return kernel_tex_image_interp_bicubic<float4>(info, x, y); } else { - CUtexObject tex = (CUtexObject)info.data; - return tex2D<float4>(tex, x, y); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_2D<float4>(tex, x, y); } } /* float, byte and half */ @@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl f = kernel_tex_image_interp_bicubic<float>(info, x, y); } else { - CUtexObject tex = (CUtexObject)info.data; - f = tex2D<float>(tex, x, y); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_2D<float>(tex, x, y); } return make_float4(f, f, f, 1.0f); } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, int id, float3 P, InterpolationType interp) @@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, return kernel_tex_image_interp_tricubic<float4>(info, x, y, z); } else { - CUtexObject tex = (CUtexObject)info.data; - return tex3D<float4>(tex, x, y, z); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z); } } else { @@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, f = kernel_tex_image_interp_tricubic<float>(info, x, y, z); } else { - CUtexObject tex = (CUtexObject)info.data; - f = tex3D<float>(tex, x, y, z); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z); } return make_float4(f, f, f, 1.0f); } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h new file mode 
100644 index 00000000000..7b79c0aedfa --- /dev/null +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -0,0 +1,843 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Common GPU kernels. */ + +#include "kernel/device/gpu/parallel_active_index.h" +#include "kernel/device/gpu/parallel_prefix_sum.h" +#include "kernel/device/gpu/parallel_sorted_index.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_init_from_bake.h" +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_work_stealing.h" + +/* -------------------------------------------------------------------- + * Integrator. 
+ */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_reset(int num_states) +{ + const int state = ccl_gpu_global_id_x(); + + if (state < num_states) { + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); +} + 
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_closest(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_shadow(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_subsurface(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_volume_stack(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_background(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_background(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_light(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_light(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_shadow(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_shadow(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_surface(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_surface_raytrace(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_volume(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_volume(NULL, state, render_buffer); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_paths_array(int 
num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == 0) && + (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_sorted_paths_array( + int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) +{ + gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? + INTEGRATOR_STATE(path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + ((INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_state_move(to_state, from_state); 
+ } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) + kernel_gpu_prefix_sum(int *values, int num_values) +{ + gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values); +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + uint *num_active_pixels) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / sw; + const int x = work_index - y * sw; + + bool converged = true; + + if (x < sw && y < sh) { + converged = kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ + const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_x( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int y = ccl_gpu_global_id_x(); + + if (y < sh) { + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_y( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int x = ccl_gpu_global_id_x(); + + if (x < sw) { + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + } +} + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) +{ + const int pixel_index = ccl_gpu_global_id_x(); + + if (pixel_index < num_pixels) { + kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + } +} + +/* -------------------------------------------------------------------- + * Film. + */ + +/* Common implementation for float destination. 
*/ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, + float *pixels, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int dst_offset, + int dst_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + ccl_global float *pixel = pixels + + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; + + processor(kfilm_convert, buffer, pixel); +} + +/* Common implementation for half4 destination and 4-channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; + float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); +} + +/* Common implementation for half4 destination and 3-channel input pass. 
*/ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); +} + +/* Common implementation for half4 destination and single channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float value; + processor(kfilm_convert, buffer, &value); + + pixel_rgba[0] = value; + pixel_rgba[1] = value; + pixel_rgba[2] = value; + pixel_rgba[3] = 1.0f; + }); +} + +#define KERNEL_FILM_CONVERT_PROC(name) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name + +#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ + (const KernelFilmConvert kfilm_convert, \ + float *pixels, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + 
kernel_gpu_film_convert_common(&kfilm_convert, \ + pixels, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ + (const KernelFilmConvert kfilm_convert, \ + uchar4 *rgba, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ + rgba, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } + +KERNEL_FILM_CONVERT_DEFINE(depth, value) +KERNEL_FILM_CONVERT_DEFINE(mist, value) +KERNEL_FILM_CONVERT_DEFINE(sample_count, value) +KERNEL_FILM_CONVERT_DEFINE(float, value) + +KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) +KERNEL_FILM_CONVERT_DEFINE(float3, rgb) + +KERNEL_FILM_CONVERT_DEFINE(motion, rgba) +KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) +KERNEL_FILM_CONVERT_DEFINE(combined, rgba) +KERNEL_FILM_CONVERT_DEFINE(float4, rgba) + +#undef KERNEL_FILM_CONVERT_DEFINE +#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE +#undef KERNEL_FILM_CONVERT_PROC + +/* -------------------------------------------------------------------- + * Shader evaluation. 
+ */ + +/* Displacement */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_displace_evaluate(NULL, input, output, offset + i); + } +} + +/* Background Shader Evaluation */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_background_evaluate(NULL, input, output, offset + i); + } +} + +/* -------------------------------------------------------------------- + * Denoising. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_preprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float *color_out = buffer + pass_denoised; + color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); + color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); + color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, 
+ int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; + const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + + float pixel_scale; + if (render_pass_sample_count == PASS_UNUSED) { + pixel_scale = 1.0f / num_samples; + } + else { + pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]); + } + + /* Albedo pass. */ + if (guiding_pass_albedo != PASS_UNUSED) { + kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); + + const float *aledo_in = buffer + render_pass_denoising_albedo; + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = aledo_in[0] * pixel_scale; + albedo_out[1] = aledo_in[1] * pixel_scale; + albedo_out[2] = aledo_in[2] * pixel_scale; + } + + /* Normal pass. 
*/ + if (render_pass_denoising_normal != PASS_UNUSED) { + kernel_assert(render_pass_denoising_normal != PASS_UNUSED); + + const float *normal_in = buffer + render_pass_denoising_normal; + float *normal_out = guiding_pixel + guiding_pass_normal; + + normal_out[0] = normal_in[0] * pixel_scale; + normal_out[1] = normal_in[1] * pixel_scale; + normal_out[2] = normal_in[2] * pixel_scale; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) +{ + kernel_assert(guiding_pass_albedo != PASS_UNUSED); + + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = 0.5f; + albedo_out[1] = 0.5f; + albedo_out[2] = 0.5f; +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_postprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float pixel_scale; + if (pass_sample_count == PASS_UNUSED) { + pixel_scale = num_samples; + } + else { + pixel_scale = __float_as_uint(buffer[pass_sample_count]); + } + + 
float *denoised_pixel = buffer + pass_denoised; + + denoised_pixel[0] *= pixel_scale; + denoised_pixel[1] *= pixel_scale; + denoised_pixel[2] *= pixel_scale; + + if (num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it + * simplifies logic and avoids extra memory allocation. */ + const float *noisy_pixel = buffer + pass_noisy; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning to zero since this is a default alpha value for 3-component passes, and it + * is an opaque pixel for 4 component passes. */ + + denoised_pixel[3] = 0; + } +} + +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, + uint *num_possible_splits) +{ + const int state = ccl_gpu_global_id_x(); + + bool can_split = false; + + if (state < num_states) { + can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. */ + const uint can_split_mask = ccl_gpu_ballot(can_split); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + } +} diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h new file mode 100644 index 00000000000..85500bf4d07 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active. + * + * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize, typename IsActiveOp> +__device__ void gpu_parallel_active_index_array(const uint num_states, + int *indices, + int *num_indices, + IsActiveOp is_active_op) +{ + extern ccl_gpu_shared int warp_offset[]; + + const uint thread_index = ccl_gpu_thread_idx_x; + const uint thread_warp = thread_index % ccl_gpu_warp_size; + + const uint warp_index = thread_index / ccl_gpu_warp_size; + const uint num_warps = blocksize / ccl_gpu_warp_size; + + /* Test if state corresponding to this thread is active. */ + const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; + + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + + /* Last thread in warp stores number of active states for each warp. 
*/ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; + } + + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } + + ccl_gpu_syncthreads(); + + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h new file mode 100644 index 00000000000..f609520b8b4 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -0,0 +1,46 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel prefix sum. + * + * TODO: actually make this work in parallel. 
+ * + * This is used for an array the size of the number of shaders in the scene + * which is not usually huge, so might not be a significant bottleneck. */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +{ + if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + return; + } + + int offset = 0; + for (int i = 0; i < num_values; i++) { + const int new_offset = offset + values[i]; + values[i] = offset; + offset = new_offset; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h new file mode 100644 index 00000000000..65b1990dbb8 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel sum of array input_data with size n into output_sum. + * + * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris. + * + * This version adds multiple elements per thread sequentially. This reduces + * the overall cost of the algorithm while keeping the work complexity O(n) and + * the step complexity O(log n). 
(Brent's Theorem optimization) */ + +#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp> +__device__ void gpu_parallel_sum( + const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert) +{ + extern ccl_gpu_shared OutputT shared_data[]; + + const uint tid = ccl_gpu_thread_idx_x; + const uint gridsize = blocksize * ccl_gpu_grid_dim_x(); + + OutputT sum = zero; + for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) { + sum += convert(input_data[i]); + } + shared_data[tid] = sum; + + ccl_gpu_syncthreads(); + + if (blocksize >= 512 && tid < 256) { + shared_data[tid] = sum = sum + shared_data[tid + 256]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 256 && tid < 128) { + shared_data[tid] = sum = sum + shared_data[tid + 128]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 128 && tid < 64) { + shared_data[tid] = sum = sum + shared_data[tid + 64]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 64 && tid < 32) { + shared_data[tid] = sum = sum + shared_data[tid + 32]; + } + + ccl_gpu_syncthreads(); + + if (tid < 32) { + for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) { + sum += ccl_shfl_down_sync(0xFFFFFFFF, sum, offset); + } + } + + if (tid == 0) { + output_sum[ccl_gpu_block_idx_x] = sum; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h new file mode 100644 index 00000000000..99b35468517 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active and sorted by a given key. The prefix sum of the number of active + * states per key must have already been computed. + * + * TODO: there may be ways to optimize this to avoid this many atomic ops? */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) + +template<uint blocksize, typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint num_states, + int *indices, + int *num_indices, + int *key_prefix_sum, + GetKeyOp get_key_op) +{ + const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; + const int key = (state_index < num_states) ? get_key_op(state_index) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + + if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { + const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); + indices[index] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h index 064c99ca100..4e255a135c6 100644 --- a/intern/cycles/kernel/kernel_compat_optix.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -15,14 +15,13 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COMPAT_OPTIX_H__ -#define __KERNEL_COMPAT_OPTIX_H__ +#pragma once #define OPTIX_DONT_INCLUDE_CUDA #include <optix.h> #define __KERNEL_GPU__ -#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too +#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */ #define __KERNEL_OPTIX__ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END @@ -31,14 +30,14 @@ # define ATTR_FALLTHROUGH #endif +/* Manual definitions so we can compile without CUDA toolkit. */ + #ifdef __CUDACC_RTC__ typedef unsigned int uint32_t; typedef unsigned long long uint64_t; #else # include <stdint.h> #endif -typedef unsigned short half; -typedef unsigned long long CUtexObject; #ifdef CYCLES_CUBIN_CC # define FLT_MIN 1.175494350822287507969e-38f @@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject; # define FLT_EPSILON 1.192092896e-07F #endif -__device__ half __float2half(const float f) -{ - half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); - return val; -} - -/* Selective nodes compilation. 
*/ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - #define ccl_device \ __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything #define ccl_device_inline ccl_device @@ -69,29 +53,75 @@ __device__ half __float2half(const float f) #define ccl_device_noinline_cpu ccl_device #define ccl_global #define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ #define ccl_constant const -#define ccl_local -#define ccl_local_param +#define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias #define ccl_addr_space -#define ccl_loop_no_unroll #define ccl_restrict __restrict__ -#define ccl_ref +#define ccl_loop_no_unroll #define ccl_align(n) __align__(n) -// Zero initialize structs to help the compiler figure out scoping +/* Zero initialize structs to help the compiler figure out scoping */ #define ccl_optional_struct_init = {} -#define kernel_data __params.data // See kernel_globals.h -#define kernel_tex_array(t) __params.t -#define kernel_tex_fetch(t, index) __params.t[(index)] +/* No assert supported for CUDA */ #define kernel_assert(cond) +/* GPU thread, block, grid size and index */ + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) +#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ + 
+typedef unsigned long long CUtexObject; +typedef CUtexObject ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Half */ + +typedef unsigned short half; + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} + /* Types */ #include "util/util_half.h" #include "util/util_types.h" - -#endif /* __KERNEL_COMPAT_OPTIX_H__ */ diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h new file mode 100644 index 00000000000..7d898ed5d91 --- /dev/null +++ b/intern/cycles/kernel/device/optix/globals.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. 
*/ +struct KernelGlobals { + int unused[1]; +}; + +/* Launch parameters */ +struct KernelParamsOptiX { + /* Kernel arguments */ + const int *path_index_array; + float *render_buffer; + + /* Global scene data and textures */ + KernelData data; +#define KERNEL_TEX(type, name) const type *name; +#include "kernel/kernel_textures.h" + + /* Integrator state */ + IntegratorStateGPU __integrator_state; +}; + +#ifdef __NVCC__ +extern "C" static __constant__ KernelParamsOptiX __params; +#endif + +/* Abstraction macros */ +#define kernel_data __params.data +#define kernel_tex_array(t) __params.t +#define kernel_tex_fetch(t, index) __params.t[(index)] +#define kernel_integrator_state __params.__integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu index 7f609eab474..c1e36febfc0 100644 --- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -16,14 +16,20 @@ */ // clang-format off -#include "kernel/kernel_compat_optix.h" -#include "util/util_atomic.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics - -#include "kernel/kernel_path.h" -#include "kernel/kernel_bake.h" +#include "kernel/device/optix/compat.h" +#include "kernel/device/optix/globals.h" + +#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" + // clang-format on template<typename T> ccl_device_forceinline T *get_payload_ptr_0() @@ 
-53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id() return OBJECT_NONE; } -extern "C" __global__ void __raygen__kernel_optix_path_trace() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { - KernelGlobals kg; // Allocate stack storage for common data - - const uint3 launch_index = optixGetLaunchIndex(); - // Keep threads for same pixel together to improve occupancy of warps - uint pixel_offset = launch_index.x / __params.tile.num_samples; - uint sample_offset = launch_index.x % __params.tile.num_samples; - - kernel_path_trace(&kg, - __params.tile.buffer, - __params.tile.start_sample + sample_offset, - __params.tile.x + pixel_offset, - __params.tile.y + launch_index.y, - __params.tile.offset, - __params.tile.stride); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_closest(nullptr, path_index); } -#ifdef __BAKING__ -extern "C" __global__ void __raygen__kernel_optix_bake() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_bake_evaluate(&kg, - p.input, - p.output, - (ShaderEvalType)p.type, - p.filter, - p.sx + optixGetLaunchIndex().x, - p.offset, - p.sample); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_intersect_shadow(nullptr, path_index); } -#endif -extern "C" __global__ void __raygen__kernel_optix_displace() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_subsurface(nullptr, path_index); } -extern "C" __global__ void __raygen__kernel_optix_background() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_intersect_volume_stack(nullptr, path_index); } extern "C" __global__ void __miss__kernel_optix_miss() @@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() { #ifdef __SHADOW_RECORD_ALL__ + bool ignore_intersection = false; + const uint prim = optixGetPrimitiveIndex(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { - return optixIgnoreIntersection(); + ignore_intersection = true; } # endif - // Offset into array with num_hits - Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2(); - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, prim); - + float u = 0.0f, v = 0.0f; if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); - isect->u = 1.0f - barycentrics.y - barycentrics.x; - isect->v = barycentrics.x; + u = 1.0f - barycentrics.y - barycentrics.x; + v = barycentrics.x; } # ifdef __HAIR__ else { - const float u = __uint_as_float(optixGetAttribute_0()); - isect->u = u; - isect->v = __uint_as_float(optixGetAttribute_1()); + u = __uint_as_float(optixGetAttribute_0()); + v = __uint_as_float(optixGetAttribute_1()); // Filter out curve endcaps if (u == 0.0f || u == 1.0f) { - return optixIgnoreIntersection(); + ignore_intersection = true; } } # endif + int num_hits = optixGetPayload_2(); + int record_index = num_hits; + const int max_hits = optixGetPayload_3(); + + if (!ignore_intersection) { + optixSetPayload_2(num_hits + 1); + } + + Intersection *const isect_array = get_payload_ptr_0<Intersection>(); + # ifdef __TRANSPARENT_SHADOWS__ - // Detect if this surface has a shader with transparent shadows - if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= 
optixGetPayload_3()) { + if (num_hits >= max_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + const int num_recorded_hits = min(max_hits, num_hits); + float max_recorded_t = isect_array[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; i++) { + if (isect_array[i].t > max_recorded_t) { + max_recorded_t = isect_array[i].t; + max_recorded_hit = i; + } + } + + if (optixGetRayTmax() >= max_recorded_t) { + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current + * hit anymore. */ + return; + } + + record_index = max_recorded_hit; + } # endif - // This is an opaque hit or the hit limit has been reached, abort traversal - optixSetPayload_5(true); - return optixTerminateRay(); + + if (!ignore_intersection) { + Intersection *const isect = isect_array + record_index; + isect->u = u; + isect->v = v; + isect->t = optixGetRayTmax(); + isect->prim = prim; + isect->object = get_object_id(); + isect->type = kernel_tex_fetch(__prim_type, prim); + +# ifdef __TRANSPARENT_SHADOWS__ + // Detect if this surface has a shader with transparent shadows + if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { +# endif + // If no transparent shadows, all light is blocked and we can stop immediately + optixSetPayload_5(true); + return optixTerminateRay(); # ifdef __TRANSPARENT_SHADOWS__ + } +# endif } - optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++ - // Continue tracing optixIgnoreIntersection(); -# endif #endif } @@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), // Attribute_0 @@ -317,11 +344,4 @@ extern "C" __global__ void 
__intersection__curve_ribbon() optix_intersection_curve(prim, type); } } - -extern "C" __global__ void __intersection__curve_all() -{ - const uint prim = optixGetPrimitiveIndex(); - const uint type = kernel_tex_fetch(__prim_type, prim); - optix_intersection_curve(prim, type); -} #endif diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu new file mode 100644 index 00000000000..bf787e29eaa --- /dev/null +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -0,0 +1,29 @@ +/* + * Copyright 2021, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copy of the regular kernels with additional shader ray-tracing kernel that takes + * much longer to compile. This is only loaded when needed by the scene. */ + +#include "kernel/device/optix/kernel.cu" +#include "kernel/integrator/integrator_shade_surface.h" + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer); +} diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h deleted file mode 100644 index b067e53a8bf..00000000000 --- a/intern/cycles/kernel/filter/filter.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_H__ -#define __FILTER_H__ - -/* CPU Filter Kernel Interface */ - -#include "util/util_types.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z -#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) -#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) - -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/filter_cpu.h" - -CCL_NAMESPACE_END - -#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h deleted file mode 100644 
index 1c0ac5e2cb7..00000000000 --- a/intern/cycles/kernel/filter/filter_defines.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_DEFINES_H__ -#define __FILTER_DEFINES_H__ - -#define DENOISE_FEATURES 11 -#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES) -#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2) -#define XTWY_SIZE (DENOISE_FEATURES + 1) - -#define DENOISE_MAX_FRAMES 16 - -typedef struct TileInfo { - int offsets[9]; - int strides[9]; - int x[4]; - int y[4]; - int from_render; - int frames[DENOISE_MAX_FRAMES]; - int num_frames; - /* TODO(lukas): CUDA doesn't have uint64_t... 
*/ -#ifdef __KERNEL_OPENCL__ - ccl_global float *buffers[9]; -#else - long long int buffers[9]; -#endif -} TileInfo; - -#ifdef __KERNEL_OPENCL__ -# define CCL_FILTER_TILE_INFO \ - ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \ - ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \ - ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \ - ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \ - ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9 -# define CCL_FILTER_TILE_INFO_ARG \ - tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \ - tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9 -# define ccl_get_tile_buffer(id) \ - (id == 0 ? tile_buffer_1 : \ - id == 1 ? tile_buffer_2 : \ - id == 2 ? tile_buffer_3 : \ - id == 3 ? tile_buffer_4 : \ - id == 4 ? tile_buffer_5 : \ - id == 5 ? tile_buffer_6 : \ - id == 6 ? tile_buffer_7 : \ - id == 7 ? tile_buffer_8 : \ - tile_buffer_9) -#else -# ifdef __KERNEL_CUDA__ -# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info -# else -# define CCL_FILTER_TILE_INFO TileInfo *tile_info -# endif -# define ccl_get_tile_buffer(id) (tile_info->buffers[id]) -#endif - -#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h deleted file mode 100644 index 8a2af957146..00000000000 --- a/intern/cycles/kernel/filter/filter_features.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always - * points to the current pixel in the first pass. Repeat the loop for every secondary frame if - * there are any. */ -#define FOR_PIXEL_WINDOW \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { - -#define END_FOR_PIXEL_WINDOW \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *features, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - features[0] = pixel.x; - features[1] = pixel.y; - features[2] = fabsf(ccl_get_feature(buffer, 0)); - features[3] = ccl_get_feature(buffer, 1); - features[4] = ccl_get_feature(buffer, 2); - features[5] = ccl_get_feature(buffer, 3); - features[6] = ccl_get_feature(buffer, 4); - features[7] = ccl_get_feature(buffer, 5); - features[8] = ccl_get_feature(buffer, 6); - features[9] = ccl_get_feature(buffer, 7); - if (use_time) { - features[10] = pixel.z; - } - if (mean) { - for (int i = 0; i < (use_time ? 
11 : 10); i++) { - features[i] -= mean[i]; - } - } -} - -ccl_device_inline void filter_get_feature_scales(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *scales, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabsf(pixel.x - mean[0]); - scales[1] = fabsf(pixel.y - mean[1]); - scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); - scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], - ccl_get_feature(buffer, 2) - mean[4], - ccl_get_feature(buffer, 3) - mean[5])); - scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); - scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], - ccl_get_feature(buffer, 6) - mean[8], - ccl_get_feature(buffer, 7) - mean[9])); - if (use_time) { - scales[6] = fabsf(pixel.z - mean[10]); - } -} - -ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) -{ - scale[0] = 1.0f / max(scale[0], 0.01f); - scale[1] = 1.0f / max(scale[1], 0.01f); - scale[2] = 1.0f / max(scale[2], 0.01f); - if (use_time) { - scale[10] = 1.0f / max(scale[6], 0.01f); - } - scale[6] = 1.0f / max(scale[4], 0.01f); - scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f); - scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f); -} - -ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, - int pass_stride) -{ - return make_float3( - ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); -} - -ccl_device_inline void design_row_add(float *design_row, - int rank, - const ccl_global float *ccl_restrict transform, - int stride, - int row, - float feature, - int transform_row_stride) -{ - for (int i = 0; i < rank; i++) { - design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature; - } -} - -/* Fill the design row. 
*/ -ccl_device_inline void filter_get_design_row_transform( - int3 p_pixel, - const ccl_global float *ccl_restrict p_buffer, - int3 q_pixel, - const ccl_global float *ccl_restrict q_buffer, - int pass_stride, - int rank, - float *design_row, - const ccl_global float *ccl_restrict transform, - int stride, - bool use_time) -{ - int num_features = use_time ? 11 : 10; - - design_row[0] = 1.0f; - math_vector_zero(design_row + 1, rank); - -#define DESIGN_ROW_ADD(I, F) \ - design_row_add(design_row, rank, transform, stride, I, F, num_features); - DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); - DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); - DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); - if (use_time) { - DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) - } -#undef DESIGN_ROW_ADD -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h deleted file mode 100644 index 59d4ace2bef..00000000000 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. - * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set - * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW_SSE \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - float4 t4 = make_float4(pixel.z); \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *features, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - int num_features = use_time ? 
11 : 10; - - features[0] = x; - features[1] = y; - features[2] = fabs(ccl_get_feature_sse(0)); - features[3] = ccl_get_feature_sse(1); - features[4] = ccl_get_feature_sse(2); - features[5] = ccl_get_feature_sse(3); - features[6] = ccl_get_feature_sse(4); - features[7] = ccl_get_feature_sse(5); - features[8] = ccl_get_feature_sse(6); - features[9] = ccl_get_feature_sse(7); - if (use_time) { - features[10] = t; - } - - if (mean) { - for (int i = 0; i < num_features; i++) { - features[i] = features[i] - mean[i]; - } - } - for (int i = 0; i < num_features; i++) { - features[i] = mask(active_pixels, features[i]); - } -} - -ccl_device_inline void filter_get_feature_scales_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *scales, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabs(x - mean[0]); - scales[1] = fabs(y - mean[1]); - scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); - scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) + - sqr(ccl_get_feature_sse(3) - mean[5]); - scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); - scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + - sqr(ccl_get_feature_sse(7) - mean[9]); - if (use_time) { - scales[6] = fabs(t - mean[10]); - } - - for (int i = 0; i < (use_time ? 
7 : 6); i++) - scales[i] = mask(active_pixels, scales[i]); -} - -ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) -{ - scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); - scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); - scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); - if (use_time) { - scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f))); - } - scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); - scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); - scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h deleted file mode 100644 index 2ef03dc0a02..00000000000 --- a/intern/cycles/kernel/filter/filter_kernel.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "util/util_color.h" -#include "util/util_math.h" -#include "util/util_math_fast.h" -#include "util/util_texture.h" - -#include "util/util_atomic.h" -#include "util/util_math_matrix.h" - -#include "kernel/filter/filter_defines.h" - -#include "kernel/filter/filter_features.h" -#ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_features_sse.h" -#endif - -#include "kernel/filter/filter_prefilter.h" - -#ifdef __KERNEL_GPU__ -# include "kernel/filter/filter_transform_gpu.h" -#else -# ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_transform_sse.h" -# else -# include "kernel/filter/filter_transform.h" -# endif -#endif - -#include "kernel/filter/filter_reconstruction.h" - -#ifdef __KERNEL_CPU__ -# include "kernel/filter/filter_nlm_cpu.h" -#else -# include "kernel/filter/filter_nlm_gpu.h" -#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h deleted file mode 100644 index 24200c29203..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs)))) -#define load4_u(buf, ofs) load_float4((buf) + (ofs)) - -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, - int dy, - const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - /* Strides need to be aligned to 16 bytes. */ - kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); - - int aligned_lowx = rect.x & (~3); - const int numChannels = (channel_offset > 0) ? 3 : 1; - const float4 channel_fac = make_float4(1.0f / numChannels); - - for (int y = rect.y; y < rect.w; y++) { - int idx_p = y * stride + aligned_lowx; - int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset; - for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { - float4 diff = make_float4(0.0f); - float4 scale_fac; - if (scale_image) { - scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), - make_float4(0.25f), - make_float4(4.0f)); - } - else { - scale_fac = make_float4(1.0f); - } - for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { - /* idx_p is guaranteed to be aligned, but idx_q isn't. 
*/ - float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs); - float4 cdiff = color_p - color_q; - float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs); - diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) / - (make_float4(1e-8f) + k_2 * (var_p + var_q)); - } - load4_a(difference_image, idx_p) = diff * channel_fac; - } - } -} - -ccl_device_inline void kernel_filter_nlm_blur( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - for (int y1 = low; y1 < high; y1++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x); - } - } - float fac = 1.0f / (high - low); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) *= fac; - } - } -} - -ccl_device_inline void nlm_blur_horizontal( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - } - - for (int dx = -f; dx <= f; dx++) { - aligned_lowx = round_down(rect.x - min(0, dx), 4); - int highx = rect.z - max(0, dx); - int4 lowx4 = make_int4(rect.x - min(0, dx)); - int4 highx4 = make_int4(rect.z - max(0, dx)); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < highx; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= lowx4) & (x4 < highx4); 
- - float4 diff = load4_u(difference_image, y * stride + x + dx); - load4_a(out_image, y * stride + x) += mask(active, diff); - } - } - } - - aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); - float4 low = max(make_float4(rect.x), x4 - make_float4(f)); - float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1)); - load4_a(out_image, y * stride + x) *= rcp(high - low); - } - } -} - -ccl_device_inline void kernel_filter_nlm_calc_weight( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - nlm_blur_horizontal(difference_image, out_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = fast_expf4( - -max(load4_a(out_image, y * stride + x), make_float4(0.0f))); - } - } -} - -ccl_device_inline void kernel_filter_nlm_update_output(int dx, - int dy, - const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *temp_image, - float *out_image, - float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - - float4 weight = load4_a(temp_image, idx_p); - load4_a(accum_image, idx_p) += mask(active, weight); - - float4 val = load4_u(image, idx_q); - if (channel_offset) { - val += load4_u(image, idx_q + channel_offset); - val += load4_u(image, idx_q + 2 * channel_offset); - val *= 1.0f / 3.0f; - 
} - - load4_a(out_image, idx_p) += mask(active, weight * val); - } - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, - int dy, - int t, - const float *ccl_restrict - difference_image, - const float *ccl_restrict buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time) -{ - int4 clip_area = rect_clip(rect, filter_window); - /* fy and fy are in filter-window-relative coordinates, - * while x and y are in feature-window-relative coordinates. */ - for (int y = clip_area.y; y < clip_area.w; y++) { - for (int x = clip_area.x; x < clip_area.z; x++) { - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - int storage_ofs = coord_to_local_index(filter_window, x, y); - float *l_transform = transform + storage_ofs * TRANSFORM_SIZE; - float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE; - float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE; - int *l_rank = rank + storage_ofs; - - kernel_filter_construct_gramian(x, - y, - 1, - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - l_transform, - l_rank, - weight, - l_XtWX, - l_XtWY, - 0); - } - } -} - -ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int4 rect, - int w) -{ - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - out_image[y * w + x] /= accum_image[y * w + x]; - } - } -} - -#undef load4_a -#undef load4_u - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h deleted file mode 100644 index 650c743f34f..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * 
Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Determines pixel coordinates and offset for the current thread. - * Returns whether the thread should do any work. - * - * All coordinates are relative to the denoising buffer! - * - * Window is the rect that should be processed. - * co is filled with (x, y, dx, dy). - */ -ccl_device_inline bool get_nlm_coords_window( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) -{ - /* Determine the pixel offset that this thread should apply. */ - int s = 2 * r + 1; - int si = ccl_global_id(1); - int sx = si % s; - int sy = si / s; - if (sy >= s) { - return false; - } - - /* Pixels still need to lie inside the denoising buffer after applying the offset, - * so determine the area for which this is the case. */ - int dx = sx - r; - int dy = sy - r; - - *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); - - /* Find the intersection of the area that we want to process (window) and the area - * that can be processed (rect) to get the final area for this offset. */ - int4 clip_area = rect_clip(window, *rect); - - /* If the radius is larger than one of the sides of the window, - * there will be shifts for which there is no usable pixel at all. */ - if (!rect_is_valid(clip_area)) { - return false; - } - - /* Map the linear thread index to pixels inside the clip area. 
*/ - int x, y; - if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { - return false; - } - - *co = make_int4(x, y, dx, dy); - - *ofs = (sy * s + sx) * stride; - - return true; -} - -ccl_device_inline bool get_nlm_coords( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) -{ - return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); -} - -ccl_device_inline void kernel_filter_nlm_calc_difference( - int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; - int numChannels = channel_offset ? 3 : 1; - - float diff = 0.0f; - float scale_fac = 1.0f; - if (scale_image) { - scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); - } - - for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { - float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; - float pvar = variance_image[idx_p]; - float qvar = sqr(scale_fac) * variance_image[idx_q]; - diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); - } - if (numChannels > 1) { - diff *= 1.0f / numChannels; - } - difference_image[y * stride + x] = diff; -} - -ccl_device_inline void kernel_filter_nlm_blur(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int y1 = low; y1 < high; y1++) { - sum += difference_image[y1 * stride + x]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = sum; -} - -ccl_device_inline void 
kernel_filter_nlm_calc_weight(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); -} - -ccl_device_inline void kernel_filter_nlm_update_output(int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict - difference_image, - const ccl_global float *ccl_restrict image, - ccl_global float *out_image, - ccl_global float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - if (out_image) { - atomic_add_and_fetch_float(accum_image + idx_p, sum); - - float val = image[idx_q]; - if (channel_offset) { - val += image[idx_q + channel_offset]; - val += image[idx_q + 2 * channel_offset]; - val *= 1.0f / 3.0f; - } - atomic_add_and_fetch_float(out_image + idx_p, sum * val); - } - else { - accum_image[idx_p] = sum; - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian( - int x, - int y, - int dx, - int dy, - int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time, - int localIdx) -{ - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for 
(int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - /* Reconstruction data is only stored for pixels inside the filter window, - * so compute the pixels's index in there. */ - int storage_ofs = coord_to_local_index(filter_window, x, y); - transform += storage_ofs; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - - kernel_filter_construct_gramian(x, - y, - rect_size(filter_window), - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - transform, - rank, - weight, - XtWX, - XtWY, - localIdx); -} - -ccl_device_inline void kernel_filter_nlm_normalize(int x, - int y, - ccl_global float *out_image, - const ccl_global float *ccl_restrict - accum_image, - int stride) -{ - out_image[y * stride + x] /= accum_image[y * stride + x]; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h deleted file mode 100644 index 97cecba190e..00000000000 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/** - * First step of the shadow prefiltering, performs the shadow division and stores all data - * in a nice and easy rectangular array that can be passed to the NLM filter. 
- * - * Calculates: - * \param unfiltered: Contains the two half images of the shadow feature pass - * \param sampleVariance: The sample-based variance calculated in the kernel. - * Note: This calculation is biased in general, - * and especially here since the variance of the ratio can only be approximated. - * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy - * (since it's essentially the buffer variance of the two variance halves) - * \param bufferVariance: The buffer-based variance of the shadow feature. - * Unbiased, but quite noisy. - */ -ccl_device void kernel_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - int x, - int y, - ccl_global float *unfilteredA, - ccl_global float *unfilteredB, - ccl_global float *sampleVariance, - ccl_global float *sampleVarianceV, - ccl_global float *bufferVariance, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - - int offset = tile_info->offsets[tile]; - int stride = tile_info->strides[tile]; - const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer( - tile); - center_buffer += (y * stride + x + offset) * buffer_pass_stride; - center_buffer += buffer_denoising_offset + 14; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); - unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); - - float varA = center_buffer[2]; - float varB = center_buffer[5]; - int odd_sample = (sample + 1) / 2; - int even_sample = sample / 2; - - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. 
*/ - varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample); - varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample); - - varA /= max(odd_sample - 1, 1); - varB /= max(even_sample - 1, 1); - - sampleVariance[idx] = 0.5f * (varA + varB) / sample; - sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample); - bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * - (unfilteredA[idx] - unfilteredB[idx]); -} - -/* Load a regular feature from the render buffers into the denoise buffer. - * Parameters: - * - sample: The sample amount in the buffer, used to normalize the buffer. - * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. - * - x, y: Current pixel - * - mean, variance: Target denoise buffers. - * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). - */ -ccl_device void kernel_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - float scale, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) + - (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) * - buffer_pass_stride + - buffer_denoising_offset; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float val = scale * center_buffer[m_offset]; - mean[idx] = val; - - if (v_offset >= 0) { - if (sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. 
*/ - variance[idx] = max( - 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; - } - } -} - -ccl_device void kernel_filter_write_feature(int sample, - int x, - int y, - int4 buffer_params, - ccl_global float *from, - ccl_global float *buffer, - int out_offset, - int4 rect) -{ - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - combined_buffer[out_offset] = from[idx]; -} - -#define GET_COLOR(image) \ - make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]) -#define SET_COLOR(image, color) \ - image[idx] = color.x; \ - image[idx + pass_stride] = color.y; \ - image[idx + 2 * pass_stride] = color.z - -ccl_device void kernel_filter_detect_outliers(int x, - int y, - ccl_global float *in, - ccl_global float *variance_out, - ccl_global float *depth, - ccl_global float *image_out, - int4 rect, - int pass_stride) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - ccl_global float *image_in = in; - ccl_global float *variance_in = in + 3 * pass_stride; - - int n = 0; - float values[25]; - float pixel_variance, max_variance = 0.0f; - for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) { - for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) { - int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x); - float3 color = GET_COLOR(image_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - float L = average(color); - - /* Find the position of L. */ - int i; - for (i = 0; i < n; i++) { - if (values[i] > L) - break; - } - /* Make space for L by shifting all following values to the right. */ - for (int j = n; j > i; j--) { - values[j] = values[j - 1]; - } - /* Insert L. 
*/ - values[i] = L; - n++; - - float3 pixel_var = GET_COLOR(variance_in); - float var = average(pixel_var); - if ((x1 == x) && (y1 == y)) { - pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f : - var; - } - else { - max_variance = max(max_variance, var); - } - } - } - - max_variance += 1e-4f; - - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float3 color = GET_COLOR(image_in); - float3 variance = GET_COLOR(variance_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - variance = max(variance, make_float3(0.0f, 0.0f, 0.0f)); - - float L = average(color); - - float ref = 2.0f * values[(int)(n * 0.75f)]; - - /* Slightly offset values to avoid false positives in (almost) black areas. */ - max_variance += 1e-5f; - ref -= 1e-5f; - - if (L > ref) { - /* The pixel appears to be an outlier. - * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is - * that the pixel should actually be at the reference value: If the reference is within the - * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very - * unlikely that the pixel should be darker, which indicates a legitimate highlight. - */ - - if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { - depth[idx] = -depth[idx]; - color *= ref / L; - variance = make_float3(max_variance, max_variance, max_variance); - } - else { - float stddev = sqrtf(pixel_variance); - if (L - 3 * stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM - * weights. */ - depth[idx] = -depth[idx]; - float fac = ref / L; - color *= fac; - variance *= sqr(fac); - } - } - } - - /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results. 
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we - * scale by the square of that (since we have variance instead of standard deviation). */ - color = color_highlight_compress(color, &variance); - - SET_COLOR(image_out, color); - SET_COLOR(variance_out, variance); -} - -#undef GET_COLOR -#undef SET_COLOR - -/* Combine A/B buffers. - * Calculates the combined mean and the buffer variance. */ -ccl_device void kernel_filter_combine_halves(int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - ccl_global float *a, - ccl_global float *b, - int4 rect, - int r) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - if (mean) - mean[idx] = 0.5f * (a[idx] + b[idx]); - if (variance) { - if (r == 0) - variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]); - else { - variance[idx] = 0.0f; - float values[25]; - int numValues = 0; - for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) { - for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) { - int pidx = (py - rect.y) * buffer_w + (px - rect.x); - values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]); - } - } - /* Insertion-sort the variances (fast enough for 25 elements). 
*/ - for (int i = 1; i < numValues; i++) { - float v = values[i]; - int j; - for (j = i - 1; j >= 0 && values[j] > v; j--) - values[j + 1] = values[j]; - values[j + 1] = v; - } - variance[idx] = values[(7 * numValues) / 8]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h deleted file mode 100644 index 17941689ad5..00000000000 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_filter_construct_gramian(int x, - int y, - int storage_stride, - int dx, - int dy, - int t, - int buffer_stride, - int pass_stride, - int frame_offset, - bool use_time, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict - transform, - ccl_global int *rank, - float weight, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int localIdx) -{ - if (weight < 1e-3f) { - return; - } - - int p_offset = y * buffer_stride + x; - int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset; - -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1); -#else - float design_row[DENOISE_FEATURES + 1]; -#endif - - float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - - /* If the pixel was flagged as an outlier during prefiltering, skip it. 
*/ - if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) { - return; - } - - filter_get_design_row_transform(make_int3(x, y, t), - buffer + p_offset, - make_int3(x + dx, y + dy, t), - buffer + q_offset, - pass_stride, - *rank, - design_row, - transform, - stride, - use_time); - -#ifdef __KERNEL_GPU__ - math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride); - math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride); -#else - math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight); - math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color); -#endif -} - -ccl_device_inline void kernel_filter_finalize(int x, - int y, - ccl_global float *buffer, - ccl_global int *rank, - int storage_stride, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 buffer_params, - int sample) -{ -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - - if (XtWX[0] < 1e-3f) { - /* There is not enough information to determine a denoised result. - * As a fallback, keep the original value of the pixel. */ - return; - } - - /* The weighted average of pixel colors (essentially, the NLM-filtered image). - * In case the solution of the linear model fails due to numerical issues or - * returns nonsensical negative values, fall back to this value. */ - float3 mean_color = XtWY[0] / XtWX[0]; - - math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride); - - float3 final_color = XtWY[0]; - if (!isfinite3_safe(final_color) || - (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) { - final_color = mean_color; - } - - /* Clamp pixel value to positive values and reverse the highlight compression transform. 
*/ - final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f))); - - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - if (buffer_params.w >= 0) { - final_color *= sample; - if (buffer_params.w > 0) { - final_color.x += combined_buffer[buffer_params.w + 0]; - final_color.y += combined_buffer[buffer_params.w + 1]; - final_color.z += combined_buffer[buffer_params.w + 2]; - } - } - combined_buffer[0] = final_color.x; - combined_buffer[1] = final_color.y; - combined_buffer[2] = final_color.z; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h deleted file mode 100644 index 880a661214e..00000000000 --- a/intern/cycles/kernel/filter/filter_transform.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float features[DENOISE_FEATURES]; - - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. 
=== */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - *rank = 0; - /* Prevent over-fitting when a small window is used. 
*/ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i * num_features, feature_scale, num_features); - } - math_matrix_transpose(transform, num_features, 1); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h deleted file mode 100644 index ec258a5212a..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - ccl_global float *transform, - ccl_global int *rank, - int radius, - float pca_threshold, - int transform_stride, - int localIdx) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES; -#else - float features[DENOISE_FEATURES]; -#endif - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - const ccl_global float *ccl_restrict pixel_buffer; - int3 pixel; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. 
=== - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, transform_stride); - - /* Bake the feature scaling into the transformation matrix. 
*/ - for (int i = 0; i < num_features; i++) { - for (int j = 0; j < (*rank); j++) { - transform[(i * num_features + j) * transform_stride] *= feature_scale[i]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h deleted file mode 100644 index 0304d990f9f..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float4 features[DENOISE_FEATURES]; - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. 
=== */ - float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add_sse(feature_means, num_features, features); - } - END_FOR_PIXEL_WINDOW_SSE - - float4 pixel_scale = make_float4(1.0f / num_pixels); - for (int i = 0; i < num_features; i++) { - feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; - } - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_feature_scales_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW_SSE - - filter_calculate_scale_sse(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. 
*/ - float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul_sse(features, num_features, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); - } - END_FOR_PIXEL_WINDOW_SSE - - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, 1); - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < num_features; i++) { - math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 5ff4d5f7053..4de824cc277 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + // clang-format off #include "kernel/geom/geom_attribute.h" #include "kernel/geom/geom_object.h" @@ -31,4 +33,5 @@ #include "kernel/geom/geom_curve_intersect.h" #include "kernel/geom/geom_volume.h" #include "kernel/geom/geom_primitive.h" +#include "kernel/geom/geom_shader_data.h" // clang-format on diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index b37797ac21b..9532a21fec7 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Attributes @@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN * Lookup of attributes is different between OSL and SVM, as OSL is ustring * based while for SVM we use integer ids. */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd); +ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd); -ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd) { if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) { return ATTR_PRIM_SUBD; @@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() /* Find attribute based on ID */ -ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object) +ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object) { return kernel_tex_fetch(__objects, object).attribute_map_offset; } -ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, +ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id) { @@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, /* Transform matrix attribute on meshes */ -ccl_device Transform 
primitive_attribute_matrix(KernelGlobals *kg, +ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index b5a62a31ca9..a827a67ce7a 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -12,6 +12,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Curve Primitive @@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN /* Reading attributes on various curve elements */ -ccl_device float curve_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device float curve_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) { float4 curvedata = kernel_tex_fetch(__curves, sd->prim); @@ -64,7 +69,7 @@ ccl_device float curve_attribute_float( } } -ccl_device float2 curve_attribute_float2(KernelGlobals *kg, +ccl_device float2 curve_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg, } } -ccl_device float3 curve_attribute_float3(KernelGlobals *kg, +ccl_device float3 curve_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, } } -ccl_device float4 curve_attribute_float4(KernelGlobals *kg, +ccl_device float4 curve_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, @@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg, /* Curve thickness */ -ccl_device float curve_thickness(KernelGlobals *kg, ShaderData 
*sd) +ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd) { float r = 0.0f; @@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) /* Curve location for motion pass, linear interpolation between keys and * ignoring radius because we do the same for the motion keys */ -ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd) { float4 curvedata = kernel_tex_fetch(__curves, sd->prim); int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); @@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd /* Curve tangent normal */ -ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd) { float3 tgN = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h index e25bf5b4660..213f3e62ee0 100644 --- a/intern/cycles/kernel/geom/geom_curve_intersect.h +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -15,6 +15,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Curve primitive intersection functions. @@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co } ccl_device bool curve_intersect_iterative(const float3 ray_dir, + float *ray_tfar, const float dt, const float4 curve[4], float u, @@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, if (fabsf(f) < f_err && fabsf(g) < g_err) { t += dt; - if (!(0.0f <= t && t <= isect->t)) { + if (!(0.0f <= t && t <= *ray_tfar)) { return false; /* Rejects NaNs */ } if (!(u >= 0.0f && u <= 1.0f)) { @@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, } /* Record intersection. 
*/ + *ray_tfar = t; isect->t = t; isect->u = u; isect->v = 0.0f; @@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, ccl_device bool curve_intersect_recursive(const float3 ray_orig, const float3 ray_dir, + float ray_tfar, float4 curve[4], Intersection *isect) { @@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, } /* Intersect with cap-planes. */ - float2 tp = make_float2(-dt, isect->t - dt); + float2 tp = make_float2(-dt, ray_tfar - dt); tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y)); const float2 h0 = half_plane_intersect( float4_to_float3(P0), float4_to_float3(dP0du), ray_dir); @@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect); + ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect); } else { recurse = true; } } - if (valid1 && (tp1.x + dt <= isect->t)) { + if (valid1 && (tp1.x + dt <= ray_tfar)) { const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE : CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect); + ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect); } else { recurse = true; @@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3], ccl_device_inline bool ribbon_intersect(const float3 ray_org, const float3 ray_dir, - const float ray_tfar, + float ray_tfar, const int N, float4 curve[4], Intersection *isect) @@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, /* Intersect quad. 
*/ float vu, vv, vt; - bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt); + bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt); if (valid0) { /* ignore self intersections */ @@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, vv = 2.0f * vv - 1.0f; /* Record intersection. */ + ray_tfar = vt; isect->t = vt; isect->u = u + vu * step_size; isect->v = vv; @@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, return false; } -ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, +ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg, Intersection *isect, const float3 P, const float3 dir, + const float tmax, uint visibility, int object, int curveAddr, @@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { /* todo: adaptive number of subdivisions could help performance here. 
*/ const int subdivisions = kernel_data.bvh.curve_subdivisions; - if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) { + if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) { isect->prim = curveAddr; isect->object = object; isect->type = type; @@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, return false; } else { - if (curve_intersect_recursive(P, dir, curve, isect)) { + if (curve_intersect_recursive(P, dir, tmax, curve, isect)) { isect->prim = curveAddr; isect->object = object; isect->type = type; @@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, } } -ccl_device_inline void curve_shader_setup(KernelGlobals *kg, +ccl_device_inline void curve_shader_setup(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); D = normalize_len(D, &t); } - int prim = kernel_tex_fetch(__prim_index, isect->prim); + int prim = kernel_tex_fetch(__prim_index, isect_prim); float4 v00 = kernel_tex_fetch(__curves, prim); int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); @@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); } - sd->u = isect->u; - P = P + D * t; - const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u); + const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u); const float3 dPdu = 
float4_to_float3(dPdu4); if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { /* Rounded smooth normals for ribbons, to approximate thick curve shape. */ const float3 tangent = normalize(dPdu); const float3 bitangent = normalize(cross(tangent, -D)); - const float sine = isect->v; + const float sine = sd->v; const float cosine = safe_sqrtf(1.0f - sine * sine); sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent))); sd->Ng = -D; - sd->v = isect->v; # if 0 /* This approximates the position and geometric normal of a thick curve too, @@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, /* Thick curves, compute normal using direction from inside the curve. * This could be optimized by recording the normal in the intersection, * however for Optix this would go beyond the size of the payload. */ - const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u)); + const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u)); const float3 Ng = normalize(P - P_inside); sd->N = Ng; @@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, sd->dPdv = cross(dPdu, sd->Ng); # endif - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index 0f66f4af755..5294da03145 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -12,6 +12,8 @@ * limitations under the License. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN /* Motion Curve Primitive @@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN #ifdef __HAIR__ -ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, +ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg, int object, uint id, AttributeElement *elem) @@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg, int offset, int numkeys, int numsteps, @@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg, /* return 2 curve key locations */ ccl_device_inline void motion_curve_keys_linear( - KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2]) + const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2]) { /* get motion info */ int numsteps, numkeys; @@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear( keys[1] = (1.0f - t) * keys[1] + t * next_keys[1]; } -ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg, int offset, int numkeys, int numsteps, @@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, } /* return 2 curve key locations */ -ccl_device_inline void motion_curve_keys(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys(const KernelGlobals *kg, int object, int prim, float time, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 53d6b92dd7e..eb4a39e062b 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -25,11 +25,13 @@ * and 
ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Time interpolation of vertex positions and normals */ -ccl_device_inline int find_attribute_motion(KernelGlobals *kg, +ccl_device_inline int find_attribute_motion(const KernelGlobals *kg, int object, uint id, AttributeElement *elem) @@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg, return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, +ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, @@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, } } -ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, +ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, @@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, } ccl_device_inline void motion_triangle_vertices( - KernelGlobals *kg, int object, int prim, float time, float3 verts[3]) + const KernelGlobals *kg, int object, int prim, float time, float3 verts[3]) { /* get motion info */ int numsteps, numverts; @@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices( } ccl_device_inline float3 motion_triangle_smooth_normal( - KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time) + const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time) { /* get motion info */ int numsteps, numverts; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index 859d919f0bb..ec7e4b07d76 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -25,6 +25,8 @@ * 
and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Refine triangle intersection to more precise hit point. For rays that travel @@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN * a closer distance. */ -ccl_device_inline float3 motion_triangle_refine( - KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3]) +ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg, + ShaderData *sd, + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim, + float3 verts[3]) { - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - #ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { + if (isect_object != OBJECT_NONE) { if (UNLIKELY(t == 0.0f)) { return P; } -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); @@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine( /* Compute refined position. */ P = P + D * rt; - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine( #endif } -/* Same as above, except that isect->t is assumed to be in object space +/* Same as above, except that t is assumed to be in object space * for instancing. 
*/ @@ -97,27 +92,22 @@ ccl_device_noinline ccl_device_inline # endif float3 - motion_triangle_refine_local(KernelGlobals *kg, + motion_triangle_refine_local(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray, + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim, float3 verts[3]) { # ifdef __KERNEL_OPTIX__ - /* isect->t is always in world space with OptiX. */ - return motion_triangle_refine(kg, sd, isect, ray, verts); + /* t is always in world space with OptiX. */ + return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts); # else - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - # ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D); @@ -138,13 +128,8 @@ ccl_device_inline P = P + D * rt; - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -160,10 +145,11 @@ ccl_device_inline * time and do a ray intersection with the resulting triangle. 
*/ -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, +ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, + float tmax, float time, uint visibility, int object, @@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, float t, u, v; if (ray_triangle_intersect(P, dir, - isect->t, + tmax, #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) (ssef *)verts, #else @@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, * Returns whether traversal should be stopped. */ #ifdef __BVH_LOCAL__ -ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg, +ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg, LocalIntersection *local_isect, float3 P, float3 dir, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 7a91f8041f7..85c4f0ca522 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -25,6 +25,8 @@ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Setup of motion triangle specific parts of ShaderData, moved into this one @@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN * normals */ /* return 3 triangle vertex normals */ -ccl_device_noinline void motion_triangle_shader_setup( - KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local) +ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg, + ShaderData *sd, + const float3 P, + const float3 D, + const float ray_t, + const int isect_object, + const int isect_prim, + bool is_local) { /* Get shader. */ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); @@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup( /* Compute refined position. 
*/ #ifdef __BVH_LOCAL__ if (is_local) { - sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts); } else #endif /* __BVH_LOCAL__*/ { - sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts); } /* Compute face normal. */ float3 Ng; diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index fe73335a335..7d6ad7b4fe3 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -22,6 +22,8 @@ * directly primitives in the BVH with world space locations applied, and the object * ID is looked up afterwards. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Object attributes, for now a fixed size and contents */ @@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST /* Object to world space transformation */ -ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg, int object, enum ObjectTransform type) { @@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, /* Lamp to world space transformation */ -ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse) +ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse) { if (inverse) { return kernel_tex_fetch(__lights, lamp).itfm; @@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo /* Object to world space transformation for motion vectors */ -ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg, int object, enum ObjectVectorTransform type) { @@ -72,7 +74,7 @@ 
ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg /* Motion blurred object transformations */ #ifdef __OBJECT_MOTION__ -ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg, int object, float time) { @@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, return tfm; } -ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg, int object, float time, Transform *itfm) @@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg } #endif +/* Get transform matrix for shading point. */ + +ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd) +{ +#ifdef __OBJECT_MOTION__ + return (sd->object_flag & SD_OBJECT_MOTION) ? + sd->ob_tfm_motion : + object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); +#else + return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); +#endif +} + +ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg, + const ShaderData *sd) +{ +#ifdef __OBJECT_MOTION__ + return (sd->object_flag & SD_OBJECT_MOTION) ? 
+ sd->ob_itfm_motion : + object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); +#else + return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); +#endif +} /* Transform position from object to world space */ -ccl_device_inline void object_position_transform(KernelGlobals *kg, +ccl_device_inline void object_position_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&sd->ob_tfm, *P); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *P = transform_point_auto(&sd->ob_tfm_motion, *P); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); -#endif } /* Transform position from world to object space */ -ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&sd->ob_itfm, *P); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *P = transform_point_auto(&sd->ob_itfm_motion, *P); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); -#endif } /* Transform normal from world to object space */ -ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); + if (sd->object_flag & SD_OBJECT_MOTION) { + if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N)); + } + return; } -#else +#endif + if (sd->object != OBJECT_NONE) { Transform tfm = 
object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); @@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); *N = normalize(transform_direction_transposed(&tfm, *N)); } -#endif } /* Transform normal from object to world space */ -ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) +ccl_device_inline void object_normal_transform(const KernelGlobals *kg, + const ShaderData *sd, + float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N)); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); -#endif } /* Transform direction vector from object to world space */ -ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) +ccl_device_inline void object_dir_transform(const KernelGlobals *kg, + const ShaderData *sd, + float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&sd->ob_tfm, *D); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *D = transform_direction_auto(&sd->ob_tfm_motion, *D); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); -#endif } /* Transform direction vector from world to object space */ -ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&sd->ob_itfm, *D); -#else - Transform tfm = object_fetch_transform(kg, sd->object, 
OBJECT_INVERSE_TRANSFORM); - *D = transform_direction(&tfm, *D); + if (sd->object_flag & SD_OBJECT_MOTION) { + *D = transform_direction_auto(&sd->ob_itfm_motion, *D); + return; + } #endif + + const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + *D = transform_direction(&tfm, *D); } /* Object center position */ -ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd) { if (sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w); + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); -#endif } /* Color of the object */ -ccl_device_inline float3 object_color(KernelGlobals *kg, int object) +ccl_device_inline float3 object_color(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object) /* Pass ID number of object */ -ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) +ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) /* Per lamp random number for shader variation */ -ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp) +ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp) { if (lamp == LAMP_NONE) return 0.0f; @@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp) /* Per object random number for shader variation */ 
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object) +ccl_device_inline float object_random_number(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object) /* Particle ID from which this object was generated */ -ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) +ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) /* Generated texture coordinate on surface from where object was instanced */ -ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) +ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) /* UV texture coordinate on surface from where object was instanced */ -ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) +ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) /* Information about mesh for motion blurred triangles and curves */ ccl_device_inline void object_motion_info( - KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys) + const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys) { if (numkeys) { *numkeys = kernel_tex_fetch(__objects, object).numkeys; @@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info( /* Offset to an objects patch map */ -ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) +ccl_device_inline uint 
object_patch_map_offset(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) /* Volume step size */ -ccl_device_inline float object_volume_density(KernelGlobals *kg, int object) +ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) { return 1.0f; @@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object) return kernel_tex_fetch(__objects, object).volume_density; } -ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object) +ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) { return kernel_data.background.volume_step_size; @@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object) /* Pass ID for shader */ -ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) +ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd) { return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id; } /* Cryptomatte ID */ -ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object) +ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object) return kernel_tex_fetch(__objects, object).cryptomatte_object; } -ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object) +ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec /* Particle data from which object was instanced */ -ccl_device_inline uint particle_index(KernelGlobals *kg, int 
particle) +ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).index; } -ccl_device float particle_age(KernelGlobals *kg, int particle) +ccl_device float particle_age(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).age; } -ccl_device float particle_lifetime(KernelGlobals *kg, int particle) +ccl_device float particle_lifetime(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).lifetime; } -ccl_device float particle_size(KernelGlobals *kg, int particle) +ccl_device float particle_size(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).size; } -ccl_device float4 particle_rotation(KernelGlobals *kg, int particle) +ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).rotation; } -ccl_device float3 particle_location(KernelGlobals *kg, int particle) +ccl_device float3 particle_location(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).location); } -ccl_device float3 particle_velocity(KernelGlobals *kg, int particle) +ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity); } -ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) +ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity); } @@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir) /* Transform ray into object space to enter static object in BVH */ ccl_device_inline float bvh_instance_push( - KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t) + const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 
*dir, float3 *idir) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push( *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if (t != FLT_MAX) { - t *= len; - } - - return t; + return len; } /* Transform ray to exit static object in BVH. */ -ccl_device_inline float bvh_instance_pop( - KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t) +ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { if (t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop( /* Same as above, but returns scale factor to apply to multiple intersection distances */ -ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, +ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, - float t, Transform *itfm) { object_fetch_transform_motion_test(kg, object, ray->time, itfm); @@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if (t != FLT_MAX) { - t *= len; - } - - return t; + return len; } /* Transform ray to exit motion blurred object in BVH. 
*/ -ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, /* Same as above, but returns scale factor to apply to multiple intersection distances */ -ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, +ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, #endif -/* TODO(sergey): This is only for until we've got OpenCL 2.0 - * on all devices we consider supported. It'll be replaced with - * generic address space. - */ +/* TODO: This can be removed when we know if no devices will require explicit + * address space qualifiers for this case. */ -#ifdef __KERNEL_OPENCL__ -ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *P) -{ - float3 private_P = *P; - object_position_transform(kg, sd, &private_P); - *P = private_P; -} - -ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *D) -{ - float3 private_D = *D; - object_dir_transform(kg, sd, &private_D); - *D = private_D; -} - -ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *N) -{ - float3 private_N = *N; - object_normal_transform(kg, sd, &private_N); - *N = private_N; -} -#endif - -#ifndef __KERNEL_OPENCL__ -# define object_position_transform_auto object_position_transform -# define object_dir_transform_auto object_dir_transform -# define object_normal_transform_auto object_normal_transform -#else -# define object_position_transform_auto object_position_transform_addrspace -# define object_dir_transform_auto 
object_dir_transform_addrspace -# define object_normal_transform_auto object_normal_transform_addrspace -#endif +#define object_position_transform_auto object_position_transform +#define object_dir_transform_auto object_dir_transform +#define object_normal_transform_auto object_normal_transform CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 9c1768f05db..ce0fc15f196 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -24,6 +24,8 @@ * language governing permissions and limitations under the Apache License. */ +#pragma once + CCL_NAMESPACE_BEGIN typedef struct PatchHandle { @@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float * /* retrieve PatchHandle from patch coords */ ccl_device_inline PatchHandle -patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v) +patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v) { PatchHandle handle; @@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl /* retrieve patch control indices */ -ccl_device_inline int patch_eval_indices(KernelGlobals *kg, +ccl_device_inline int patch_eval_indices(const KernelGlobals *kg, const PatchHandle *handle, int channel, int indices[PATCH_MAX_CONTROL_VERTS]) @@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg, /* evaluate patch basis functions */ -ccl_device_inline void patch_eval_basis(KernelGlobals *kg, +ccl_device_inline void patch_eval_basis(const KernelGlobals *kg, const PatchHandle *handle, float u, float v, @@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg, /* generic function for evaluating indices and weights from patch coords */ -ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg, +ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg, int 
object, int patch, float u, @@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg, /* functions for evaluating attributes on patches */ -ccl_device float patch_eval_float(KernelGlobals *kg, +ccl_device float patch_eval_float(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, return val; } -ccl_device float2 patch_eval_float2(KernelGlobals *kg, +ccl_device float2 patch_eval_float2(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg, return val; } -ccl_device float3 patch_eval_float3(KernelGlobals *kg, +ccl_device float3 patch_eval_float3(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, return val; } -ccl_device float4 patch_eval_float4(KernelGlobals *kg, +ccl_device float4 patch_eval_float4(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg, return val; } -ccl_device float4 patch_eval_uchar4(KernelGlobals *kg, +ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index aeb044c9ad3..ba31b12e817 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -19,6 +19,10 @@ * Generic functions to look up mesh, curve and volume primitive attributes for * shading and render passes. */ +#pragma once + +#include "kernel/kernel_projection.h" + CCL_NAMESPACE_BEGIN /* Surface Attributes @@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN * attributes for performance, mainly for GPU performance to avoid bringing in * heavy volume interpolation code. 
*/ -ccl_device_inline float primitive_surface_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (sd->type & PRIMITIVE_ALL_TRIANGLE) { if (subd_triangle_patch(kg, sd) == ~0) @@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float( } } -ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg, +ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg, } } -ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg, +ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg, } } -ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg, - const ShaderData *sd, - const AttributeDescriptor desc, - float4 *dx, - float4 *dy) +ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float4 *dx, + float4 *dy) { if (sd->type & PRIMITIVE_ALL_TRIANGLE) { if (subd_triangle_patch(kg, sd) == ~0) @@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd, return sd->type == PRIMITIVE_VOLUME; } -ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg, +ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -153,7 +160,7 @@ ccl_device_inline float 
primitive_volume_attribute_float(KernelGlobals *kg, } } -ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg, +ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg, } } -ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg, +ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg, /* Default UV coordinate */ -ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd) { const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV); @@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) /* Ptex coordinates */ -ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id) +ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id) { /* storing ptex data as attributes is not memory efficient but simple for tests */ const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID); @@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in /* Surface tangent */ -ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ if (sd->type & PRIMITIVE_ALL_CURVE) @@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) /* Motion vector for motion pass */ -ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline 
float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd) { /* center position */ float3 center; diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h new file mode 100644 index 00000000000..fb2cb5cb1ea --- /dev/null +++ b/intern/cycles/kernel/geom/geom_shader_data.h @@ -0,0 +1,373 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Functions to initialize ShaderData given. + * + * Could be from an incoming ray, intersection or sampled position. */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* ShaderData setup from incoming ray */ + +#ifdef __OBJECT_MOTION__ +ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + float time) +{ + if (sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion); + } +} +#endif + +/* TODO: break this up if it helps reduce register pressure to load data from + * global memory as we write it to shaderdata. */ +ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const Ray *ccl_restrict ray, + const Intersection *ccl_restrict isect) +{ + /* Read intersection data into shader globals. 
+ * + * TODO: this is redundant, could potentially remove some of this from + * ShaderData but would need to ensure that it also works for shadow + * shader evaluation. */ + sd->u = isect->u; + sd->v = isect->v; + sd->ray_length = isect->t; + sd->type = isect->type; + sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) : + isect->object; + sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->lamp = LAMP_NONE; + sd->flag = 0; + + /* Read matrices and time. */ + sd->time = ray->time; + +#ifdef __OBJECT_MOTION__ + shader_setup_object_transforms(kg, sd, ray->time); +#endif + + /* Read ray data into shader globals. */ + sd->I = -ray->D; + +#ifdef __HAIR__ + if (sd->type & PRIMITIVE_ALL_CURVE) { + /* curve */ + curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim); + } + else +#endif + if (sd->type & PRIMITIVE_TRIANGLE) { + /* static triangle */ + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + + /* vectors */ + sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim); + sd->Ng = Ng; + sd->N = Ng; + + /* smooth normal */ + if (sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); + +#ifdef __DPDU__ + /* dPdu/dPdv */ + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); +#endif + } + else { + /* motion triangle */ + motion_triangle_shader_setup( + kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false); + } + + sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + + if (isect->object != OBJECT_NONE) { + /* instance transform */ + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); +#ifdef __DPDU__ + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); +#endif + } + + /* backfacing test */ + 
bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + + if (backfacing) { + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; +#ifdef __DPDU__ + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; +#endif + } + +#ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length); + differential_incoming_compact(&sd->dI, ray->D, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); +#endif +} + +/* ShaderData setup from position sampled on mesh */ + +ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const float3 P, + const float3 Ng, + const float3 I, + int shader, + int object, + int prim, + float u, + float v, + float t, + float time, + bool object_space, + int lamp) +{ + /* vectors */ + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; + if (prim != PRIM_NONE) + sd->type = PRIMITIVE_TRIANGLE; + else if (lamp != LAMP_NONE) + sd->type = PRIMITIVE_LAMP; + else + sd->type = PRIMITIVE_NONE; + + /* primitive */ + sd->object = object; + sd->lamp = LAMP_NONE; + /* Currently no access to bvh prim index for strand sd->prim. 
*/ + sd->prim = prim; + sd->u = u; + sd->v = v; + sd->time = time; + sd->ray_length = t; + + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + if (sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object); + +#ifdef __OBJECT_MOTION__ + shader_setup_object_transforms(kg, sd, time); +#endif + } + else if (lamp != LAMP_NONE) { + sd->lamp = lamp; + } + + /* transform into world space */ + if (object_space) { + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); + } + + if (sd->type & PRIMITIVE_TRIANGLE) { + /* smooth normal */ + if (sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); + + if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_normal_transform_auto(kg, sd, &sd->N); + } + } + + /* dPdu/dPdv */ +#ifdef __DPDU__ + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + + if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); + } +#endif + } + else { +#ifdef __DPDU__ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +#endif + } + + /* backfacing test */ + if (sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + + if (backfacing) { + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; +#ifdef __DPDU__ + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; +#endif + } + } + +#ifdef __RAY_DIFFERENTIALS__ + /* no ray differentials here yet */ + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); +#endif +} + +/* ShaderData setup for displacement */ + +ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + int object, + int prim, + float u, + float v) 
+{ + float3 P, Ng, I = zero_float3(); + int shader; + + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); + + /* force smooth shading for displacement */ + shader |= SHADER_SMOOTH_NORMAL; + + shader_setup_from_sample( + kg, + sd, + P, + Ng, + I, + shader, + object, + prim, + u, + v, + 0.0f, + 0.5f, + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), + LAMP_NONE); +} + +/* ShaderData setup from ray into background */ + +ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const float3 ray_P, + const float3 ray_D, + const float ray_time) +{ + /* for NDC coordinates */ + sd->ray_P = ray_P; + + /* vectors */ + sd->P = ray_D; + sd->N = -ray_D; + sd->Ng = -ray_D; + sd->I = -ray_D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + sd->time = ray_time; + sd->ray_length = 0.0f; + + sd->object = OBJECT_NONE; + sd->lamp = LAMP_NONE; + sd->prim = PRIM_NONE; + sd->u = 0.0f; + sd->v = 0.0f; + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +#endif + +#ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + sd->dP = differential3_zero(); /* TODO: ray->dP */ + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); +#endif +} + +/* ShaderData setup from point inside volume */ + +#ifdef __VOLUME__ +ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const Ray *ccl_restrict ray) +{ + + /* vectors */ + sd->P = ray->P; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = SHADER_NONE; + sd->flag = 0; + sd->object_flag = 0; + sd->time = ray->time; + sd->ray_length = 0.0f; /* todo: can we set this to some useful value? 
*/ + + sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */ + sd->lamp = LAMP_NONE; + sd->prim = PRIM_NONE; + sd->type = PRIMITIVE_VOLUME; + + sd->u = 0.0f; + sd->v = 0.0f; + +# ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +# endif + +# ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + sd->dP = differential3_zero(); /* TODO ray->dD */ + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); +# endif + + /* for NDC coordinates */ + sd->ray_P = ray->P; + sd->ray_dP = ray->dP; +} +#endif /* __VOLUME__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 9eceb996926..877b2ece15b 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -16,18 +16,20 @@ /* Functions for retrieving attributes on triangles produced from subdivision meshes */ +#pragma once + CCL_NAMESPACE_BEGIN /* Patch index for triangle, -1 if not subdivision triangle */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd) { return (sd->prim != PRIM_NONE) ? 
kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ -ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, +ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { @@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, /* Vertex indices of patch */ -ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch) +ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch) { uint4 indices; @@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch /* Originating face for patch */ -ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch) +ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch) { return kernel_tex_fetch(__patches, patch + 4); } /* Number of corners on originating face */ -ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch) +ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch) { return kernel_tex_fetch(__patches, patch + 5) & 0xffff; } /* Indices of the four corners that are used by the patch */ -ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4]) +ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg, + int patch, + int corners[4]) { uint4 data; @@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, /* Reading attributes on various subdivision triangle elements */ -ccl_device_noinline float subd_triangle_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { int patch = 
subd_triangle_patch(kg, sd); @@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float( } } -ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg, +ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg, } } -ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, +ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, } } -ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg, +ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index ff7909ca425..910fb122c6d 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -20,10 +20,12 @@ * ray intersection we use a precomputed triangle storage to accelerate * intersection at the cost of more memory usage */ +#pragma once + CCL_NAMESPACE_BEGIN /* Normal on triangle. */ -ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); @@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) } /* Point and normal on triangle. 
*/ -ccl_device_inline void triangle_point_normal( - KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) +ccl_device_inline void triangle_point_normal(const KernelGlobals *kg, + int object, + int prim, + float u, + float v, + float3 *P, + float3 *Ng, + int *shader) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal( /* Triangle vertex locations */ -ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) +ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3]) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0)); @@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Triangle vertex locations and vertex normals */ -ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg, +ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg, int prim, float3 P[3], float3 N[3]) @@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg, /* Interpolate smooth vertex normal from vertices */ ccl_device_inline float3 -triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) +triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) } ccl_device_inline float3 triangle_smooth_normal_unnormalized( - KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v) + const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = 
kernel_tex_fetch(__tri_vindex, prim); @@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized( /* Ray differentials on triangle */ -ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, +ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) @@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, /* Reading attributes on various triangle elements */ -ccl_device float triangle_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device float triangle_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) { float f0, f1, f2; @@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float( } } -ccl_device float2 triangle_attribute_float2(KernelGlobals *kg, +ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg, } } -ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, +ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, } } -ccl_device float4 triangle_attribute_float4(KernelGlobals *kg, +ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index b0cce274b94..30b77ebd2eb 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ 
b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -20,12 +20,17 @@ * intersection at the cost of more memory usage. */ +#pragma once + +#include "kernel/kernel_random.h" + CCL_NAMESPACE_BEGIN -ccl_device_inline bool triangle_intersect(KernelGlobals *kg, +ccl_device_inline bool triangle_intersect(const KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, + float tmax, uint visibility, int object, int prim_addr) @@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, float t, u, v; if (ray_triangle_intersect(P, dir, - isect->t, + tmax, #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) ssef_verts, #else @@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, */ #ifdef __BVH_LOCAL__ -ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg, +ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg, LocalIntersection *local_isect, float3 P, float3 dir, @@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg, * http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf */ -ccl_device_inline float3 triangle_refine(KernelGlobals *kg, +ccl_device_inline float3 triangle_refine(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - #ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { + if (isect_object != OBJECT_NONE) { if (UNLIKELY(t == 0.0f)) { return P; } -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); @@ -219,7 +219,7 @@ ccl_device_inline 
float3 triangle_refine(KernelGlobals *kg, P = P + D * t; - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim); const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2); @@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D * rt; } - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, #endif } -/* Same as above, except that isect->t is assumed to be in object space for +/* Same as above, except that t is assumed to be in object space for * instancing. */ -ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, +ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { #ifdef __KERNEL_OPTIX__ - /* isect->t is always in world space with OptiX. */ - return triangle_refine(kg, sd, isect, ray); + /* t is always in world space with OptiX. 
*/ + return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim); #else - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D); @@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, P = P + D * t; # ifdef __INTERSECTION_REFINE__ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim); const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2); @@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, } # endif /* __INTERSECTION_REFINE__ */ - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 809b76245ba..2bcd7e56b5f 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -23,13 +23,15 @@ * 3D voxel textures can be assigned as attributes per mesh, which means the * same shader can be used for volume objects with different densities, etc. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN #ifdef __VOLUME__ /* Return position normalized to 0..1 in mesh bounds */ -ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, +ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg, const ShaderData *sd, float3 P) { @@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value) } } -ccl_device float4 volume_attribute_float4(KernelGlobals *kg, +ccl_device float4 volume_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h new file mode 100644 index 00000000000..4898ff936c6 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h @@ -0,0 +1,181 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_random.h" + +#include "kernel/geom/geom.h" + +CCL_NAMESPACE_BEGIN + +/* This helps with AA but it's not the real solution as it does not AA the geometry + * but it's better than nothing, thus committed. 
*/ +ccl_device_inline float bake_clamp_mirror_repeat(float u, float max) +{ + /* use mirror repeat (like opengl texture) so that if the barycentric + * coordinate goes past the end of the triangle it is not always clamped + * to the same value, gives ugly patterns */ + u /= max; + float fu = floorf(u); + u = u - fu; + + return ((((int)fu) & 1) ? 1.0f - u : u) * max; +} + +/* Return false to indicate that this pixel is finished. + * Used by CPU implementation to not attempt to sample pixel for multiple samples once its known + * that the pixel did converge. */ +ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + ccl_global float *render_buffer, + const int x, + const int y, + const int scheduled_sample) +{ + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + + /* Initialize path state to give basic buffer access and allow early outputs. */ + path_state_init(INTEGRATOR_STATE_PASS, tile, x, y); + + /* Check whether the pixel has converged and should not be sampled anymore. */ + if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) { + return false; + } + + /* Always count the sample, even if the camera sample will reject the ray. */ + const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample); + + /* Setup render buffers. */ + const int index = INTEGRATOR_STATE(path, render_pixel_index); + const int pass_stride = kernel_data.film.pass_stride; + render_buffer += index * pass_stride; + + ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive; + ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential; + + const int seed = __float_as_uint(primitive[0]); + int prim = __float_as_uint(primitive[1]); + if (prim == -1) { + return false; + } + + prim += kernel_data.bake.tri_offset; + + /* Random number generator. 
*/ + const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed; + + float filter_x, filter_y; + if (sample == 0) { + filter_x = filter_y = 0.5f; + } + else { + path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y); + } + + /* Initialize path state for path integration. */ + path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash); + + /* Barycentric UV with sub-pixel offset. */ + float u = primitive[2]; + float v = primitive[3]; + + float dudx = differential[0]; + float dudy = differential[1]; + float dvdx = differential[2]; + float dvdy = differential[3]; + + if (sample > 0) { + u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f); + v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f), + 1.0f - u); + } + + /* Position and normal on triangle. */ + float3 P, Ng; + int shader; + triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader); + shader &= SHADER_MASK; + + if (kernel_data.film.pass_background != PASS_UNUSED) { + /* Environment baking. */ + + /* Setup and write ray. */ + Ray ray ccl_optional_struct_init; + ray.P = zero_float3(); + ray.D = normalize(P); + ray.t = FLT_MAX; + ray.time = 0.5f; + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Setup next kernel to execute. */ + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + else { + /* Surface baking. */ + + /* Setup ray. */ + Ray ray ccl_optional_struct_init; + ray.P = P + Ng; + ray.D = -Ng; + ray.t = FLT_MAX; + ray.time = 0.5f; + + /* Setup differentials. */ + float3 dPdu, dPdv; + triangle_dPdudv(kg, prim, &dPdu, &dPdv); + differential3 dP; + dP.dx = dPdu * dudx + dPdv * dvdx; + dP.dy = dPdu * dudy + dPdv * dvdy; + ray.dP = differential_make_compact(dP); + ray.dD = differential_zero_compact(); + + /* Write ray. 
*/ + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Setup and write intersection. */ + Intersection isect ccl_optional_struct_init; + isect.object = kernel_data.bake.object_index; + isect.prim = prim; + isect.u = u; + isect.v = v; + isect.t = 1.0f; + isect.type = PRIMITIVE_TRIANGLE; +#ifdef __EMBREE__ + isect.Ng = Ng; +#endif + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect); + + /* Setup next kernel to execute. */ + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h new file mode 100644 index 00000000000..58e7bde4c94 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_shadow_catcher.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg, + const int sample, + const int x, + const int y, + const uint rng_hash, + Ray *ray) +{ + /* Filter sampling. */ + float filter_u, filter_v; + + if (sample == 0) { + filter_u = 0.5f; + filter_v = 0.5f; + } + else { + path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v); + } + + /* Depth of field sampling. */ + float lens_u = 0.0f, lens_v = 0.0f; + if (kernel_data.cam.aperturesize > 0.0f) { + path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v); + } + + /* Motion blur time sampling. */ + float time = 0.0f; +#ifdef __CAMERA_MOTION__ + if (kernel_data.cam.shuttertime != -1.0f) + time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME); +#endif + + /* Generate camera ray. */ + camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); +} + +/* Return false to indicate that this pixel is finished. + * Used by CPU implementation to not attempt to sample pixel for multiple samples once it's known + * that the pixel did converge. */ +ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + ccl_global float *render_buffer, + const int x, + const int y, + const int scheduled_sample) +{ + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + + /* Initialize path state to give basic buffer access and allow early outputs. */ + path_state_init(INTEGRATOR_STATE_PASS, tile, x, y); + + /* Check whether the pixel has converged and should not be sampled anymore. */ + if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) { + return false; + } + + /* Count the sample and get an effective sample for this pixel. 
+ * + * This logic allows to both count actual number of samples per pixel, and to add samples to this + * pixel after it was converged and samples were added somewhere else (in which case the + * `scheduled_sample` will be different from actual number of samples in this pixel). */ + const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample); + + /* Initialize random number seed for path. */ + const uint rng_hash = path_rng_hash_init(kg, sample, x, y); + + { + /* Generate camera ray. */ + Ray ray; + integrate_camera_sample(kg, sample, x, y, rng_hash, &ray); + if (ray.t == 0.0f) { + return true; + } + + /* Write camera ray to state. */ + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + } + + /* Initialize path state for path integration. */ + path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash); + + /* Continue with intersect_closest kernel, optionally initializing volume + * stack before that if the camera may be inside a volume. */ + if (kernel_data.cam.is_inside_volume) { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); + } + else { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h new file mode 100644 index 00000000000..34ca6814534 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h @@ -0,0 +1,248 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_differential.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shadow_catcher.h" + +#include "kernel/geom/geom.h" + +#include "kernel/bvh/bvh.h" + +CCL_NAMESPACE_BEGIN + +template<uint32_t current_kernel> +ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS, + const int shader_flags) +{ + + /* Optional AO bounce termination. + * We continue evaluating emissive/transparent surfaces and volumes, similar + * to direct lighting. Only if we know there are none can we terminate the + * path immediately. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } + else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME; + } + else { + return true; + } + } + + /* Load random number state. */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* We perform path termination in this kernel to avoid launching shade_surface + * and evaluating the shader when not needed. Only for emission and transparent + * surfaces in front of emission do we need to evaluate the shader, since we + * perform MIS as part of indirect rays. 
*/ + const int path_flag = INTEGRATOR_STATE(path, flag); + const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag); + + if (probability != 1.0f) { + const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE); + + if (probability == 0.0f || terminate >= probability) { + if (shader_flags & SD_HAS_EMISSION) { + /* Mark path to be terminated right after shader evaluation on the surface. */ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE; + } + else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + /* TODO: only do this for emissive volumes. */ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME; + } + else { + return true; + } + } + } + + return false; +} + +/* Note that current_kernel is a template value since making this a variable + * leads to poor performance with CUDA atomics. */ +template<uint32_t current_kernel> +ccl_device_forceinline void integrator_intersect_shader_next_kernel( + INTEGRATOR_STATE_ARGS, + const Intersection *ccl_restrict isect, + const int shader, + const int shader_flags) +{ + /* Note on scheduling. + * + * When there is no shadow catcher split the scheduling is simple: schedule surface shading with + * or without raytrace support, depending on the shader used. + * + * When there is a shadow catcher split the general idea is to have the following configuration: + * + * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which + * will trace shadow catcher object. + * + * - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for + * the matte ray. + * + * - Otherwise schedule background shading kernel, so that we have a background to alpha-over + * on. The background kernel will then schedule surface shading for the matte ray. + * + * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for + * the matte path. 
*/ + + const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) || + (kernel_data.film.pass_ao != PASS_UNUSED)); + + if (use_raytrace_kernel) { + INTEGRATOR_PATH_NEXT_SORTED( + current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + +#ifdef __SHADOW_CATCHER__ + const int object_flags = intersection_get_object_flags(kg, isect); + if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) { + if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND; + + if (use_raytrace_kernel) { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + else { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + } + else if (use_raytrace_kernel) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + } +#endif +} + +ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST); + + /* Read ray from integrator state into local memory. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + kernel_assert(ray.t != 0.0f); + + const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS); + const int last_isect_prim = INTEGRATOR_STATE(isect, prim); + const int last_isect_object = INTEGRATOR_STATE(isect, object); + + /* Trick to use short AO rays to approximate indirect light at the end of the path. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + ray.t = kernel_data.integrator.ao_bounces_distance; + + const int last_object = last_isect_object != OBJECT_NONE ? 
+ last_isect_object : + kernel_tex_fetch(__prim_object, last_isect_prim); + const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance; + if (object_ao_distance != 0.0f) { + ray.t = object_ao_distance; + } + } + + /* Scene Intersection. */ + Intersection isect ccl_optional_struct_init; + bool hit = scene_intersect(kg, &ray, visibility, &isect); + + /* TODO: remove this and do it in the various intersection functions instead. */ + if (!hit) { + isect.prim = PRIM_NONE; + } + + /* Light intersection for MIS. */ + if (kernel_data.integrator.use_lamp_mis) { + /* NOTE: if we make lights visible to camera rays, we'll need to initialize + * these in the path_state_init. */ + const int last_type = INTEGRATOR_STATE(isect, type); + const int path_flag = INTEGRATOR_STATE(path, flag); + + hit = lights_intersect( + kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) || + hit; + } + + /* Write intersection result into global integrator state memory. */ + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect); + +#ifdef __VOLUME__ + if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP); + const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE; + const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0; + + if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, flags)) { + /* Continue with volume kernel if we are inside a volume, regardless + * if we hit anything. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + } + else { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + return; + } +#endif + + if (hit) { + /* Hit a surface, continue with light or surface kernel. 
*/ + if (isect.type & PRIMITIVE_LAMP) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + /* Hit a surface, continue with surface kernel unless terminated. */ + const int shader = intersection_get_shader(kg, &isect); + const int flags = kernel_tex_fetch(__shaders, shader).flags; + + if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, flags)) { + integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, &isect, shader, flags); + return; + } + else { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + } + } + else { + /* Nothing hit, continue with background kernel. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h new file mode 100644 index 00000000000..5bd9cfda4a4 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Visibility for the shadow ray. 
 */ +ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS) +{ + uint visibility = PATH_RAY_SHADOW; + +#ifdef __SHADOW_CATCHER__ + const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag); + visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility); +#endif + + return visibility; +} + +ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS, + const Ray *ray, + const uint visibility) +{ + /* Mask which will pick only opaque visibility bits from the `visibility`. + * Calculate the mask at compile time: the visibility will either be high bits for the shadow + * catcher objects, or lower bits for the regular objects (there is no need to check the path + * state here again). */ + constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) | + PATH_RAY_SHADOW_OPAQUE; + + Intersection isect; + const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect); + + if (!opaque_hit) { + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0; + } + + return opaque_hit; +} + +ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS) +{ + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce); + + return max(transparent_max_bounce - transparent_bounce - 1, 0); +} + +#ifdef __TRANSPARENT_SHADOWS__ +ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS, + const Ray *ray, + const uint visibility) +{ + Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE]; + + /* Limit the number of hits to the max transparent bounces allowed and the size that we + * have available in the integrator state. 
*/ + const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS); + const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE); + uint num_hits = 0; + bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits); + + /* If number of hits exceed the transparent bounces limit, make opaque. */ + if (num_hits > max_transparent_hits) { + opaque_hit = true; + } + + if (!opaque_hit) { + uint num_recorded_hits = min(num_hits, max_hits); + + if (num_recorded_hits > 0) { + sort_intersections(isect, num_recorded_hits); + + /* Write intersection result into global integrator state memory. */ + for (int hit = 0; hit < num_recorded_hits; hit++) { + integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit); + } + } + + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits; + } + else { + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0; + } + + return opaque_hit; +} +#endif + +ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW); + + /* Read ray from integrator state into local memory. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Compute visibility. */ + const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS); + +#ifdef __TRANSPARENT_SHADOWS__ + /* TODO: compile different kernels depending on this? Especially for OptiX + * conditional trace calls are bad. */ + const bool opaque_hit = + (kernel_data.integrator.transparent_shadows) ? + integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) : + integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility); +#else + const bool opaque_hit = integrate_intersect_shadow_opaque( + INTEGRATOR_STATE_PASS, &ray, visibility); +#endif + + if (opaque_hit) { + /* Hit an opaque surface, shadow path ends here. 
*/ + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return; + } + else { + /* Hit nothing or transparent surfaces, continue to shadow kernel + * for shading and render buffer output. + * + * TODO: could also write to render buffer directly if no transparent shadows? + * Could save a kernel execution for the common case. */ + INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, + DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h index c10ecc426c6..7c090952dc7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl +++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,23 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" +#pragma once -__kernel void kernel_ocl_path_trace_state_buffer_size( - ccl_global char *kg, - ccl_constant KernelData *data, - uint num_threads, - ccl_global uint64_t *size) +#include "kernel/integrator/integrator_subsurface.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS) { - ((KernelGlobals*)kg)->data = data; - *size = split_data_buffer_size((KernelGlobals*)kg, num_threads); + PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE); + +#ifdef __SUBSURFACE__ + if (subsurface_scatter(INTEGRATOR_STATE_PASS)) { + return; + } +#endif + + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); } +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h new file mode 100644 index 00000000000..60d8a8e3e54 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h @@ -0,0 +1,198 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/bvh/bvh.h" +#include "kernel/geom/geom.h" +#include "kernel/integrator/integrator_volume_stack.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS, + const float3 from_P, + const float3 to_P) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK); + + ShaderDataTinyStorage stack_sd_storage; + ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage); + + kernel_assert(kernel_data.integrator.use_volumes); + + Ray volume_ray ccl_optional_struct_init; + volume_ray.P = from_P; + volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t); + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2 * VOLUME_STACK_SIZE + 1]; + uint num_hits = scene_intersect_volume_all( + kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY); + if (num_hits > 0) { + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for (uint hit = 0; hit < num_hits; ++hit, ++isect) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, isect); + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd); + } + } +#else + Intersection isect; + int step = 0; + while (step < 2 * VOLUME_STACK_SIZE && + scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect); + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd); + + /* Move ray forward. 
*/ + volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); + if (volume_ray.t != FLT_MAX) { + volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t); + } + ++step; + } +#endif +} + +ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK); + + ShaderDataTinyStorage stack_sd_storage; + ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage); + + Ray volume_ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray); + volume_ray.t = FLT_MAX; + + const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY); + int stack_index = 0, enclosed_index = 0; + + /* Write background shader. */ + if (kernel_data.background.volume_shader != SHADER_NONE) { + const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + stack_index++; + } + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2 * VOLUME_STACK_SIZE + 1]; + uint num_hits = scene_intersect_volume_all( + kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility); + if (num_hits > 0) { + int enclosed_volumes[VOLUME_STACK_SIZE]; + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for (uint hit = 0; hit < num_hits; ++hit, ++isect) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, isect); + if (stack_sd->flag & SD_BACKFACING) { + bool need_add = true; + for (int i = 0; i < enclosed_index && need_add; ++i) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + if (enclosed_volumes[i] == stack_sd->object) { + need_add = false; + } + } + for (int i = 0; i < stack_index && need_add; ++i) { + /* Don't add intersections twice. 
*/ + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.object == stack_sd->object) { + need_add = false; + break; + } + } + if (need_add && stack_index < VOLUME_STACK_SIZE - 1) { + const VolumeStack new_entry = {stack_sd->object, stack_sd->shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stack on exit. + */ + enclosed_volumes[enclosed_index++] = stack_sd->object; + } + } + } +#else + int enclosed_volumes[VOLUME_STACK_SIZE]; + int step = 0; + + while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 && + step < 2 * VOLUME_STACK_SIZE) { + Intersection isect; + if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) { + break; + } + + shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect); + if (stack_sd->flag & SD_BACKFACING) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + bool need_add = true; + for (int i = 0; i < enclosed_index && need_add; ++i) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + if (enclosed_volumes[i] == stack_sd->object) { + need_add = false; + } + } + for (int i = 0; i < stack_index && need_add; ++i) { + /* Don't add intersections twice. */ + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.object == stack_sd->object) { + need_add = false; + break; + } + } + if (need_add) { + const VolumeStack new_entry = {stack_sd->object, stack_sd->shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stack on exit. 
+ */ + enclosed_volumes[enclosed_index++] = stack_sd->object; + } + + /* Move ray forward. */ + volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); + ++step; + } +#endif + + /* Write terminator. */ + const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h new file mode 100644 index 00000000000..91363ea1c7f --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_megakernel.h @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Each kernel indicates the next kernel to execute, so here we simply + * have to check what that kernel is and execute it. + * + * TODO: investigate if we can use device side enqueue for GPUs to avoid + * having to compile this big kernel. */ + while (true) { + if (INTEGRATOR_STATE(shadow_path, queued_kernel)) { + /* First handle any shadow paths before we potentially create more shadow paths. */ + switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + integrator_intersect_shadow(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer); + break; + default: + kernel_assert(0); + break; + } + } + else if (INTEGRATOR_STATE(path, queued_kernel)) { + /* Then handle regular path kernels. 
*/ + switch (INTEGRATOR_STATE(path, queued_kernel)) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + integrator_intersect_closest(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + integrator_intersect_subsurface(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS); + break; + default: + kernel_assert(0); + break; + } + } + else { + break; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h new file mode 100644 index 00000000000..3e4cc837e9b --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_background.h @@ -0,0 +1,215 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ +#ifdef __BACKGROUND__ + const int shader = kernel_data.background.surface_shader; + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + /* Use visibility flag to skip lights. */ + if (shader & SHADER_EXCLUDE_ANY) { + if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) || + ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return zero_float3(); + } + + /* Use fast constant background color if available. */ + float3 L = zero_float3(); + if (!shader_constant_emission_eval(kg, shader, &L)) { + /* Evaluate background shader. */ + + /* TODO: does aliasing like this break automatic SoA in CUDA? + * Should we instead store closures separate from ShaderData? 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP); + shader_setup_from_background(kg, + emission_sd, + INTEGRATOR_STATE(ray, P), + INTEGRATOR_STATE(ray, D), + INTEGRATOR_STATE(ray, time)); + + PROFILING_SHADER(emission_sd->object, emission_sd->shader); + PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL); + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION); + + L = shader_background_eval(emission_sd); + } + + /* Background MIS weights. */ +# ifdef __BACKGROUND_MIS__ + /* Check if background light exists or if we should skip pdf. */ + if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) { + const float3 ray_P = INTEGRATOR_STATE(ray, P); + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t); + + /* multiple importance sampling, get background light pdf for ray + * direction, and compute weight with respect to BSDF pdf */ + const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D); + const float mis_weight = power_heuristic(mis_ray_pdf, pdf); + + L *= mis_weight; + } +# endif + + return L; +#else + return make_float3(0.8f, 0.8f, 0.8f); +#endif +} + +ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Accumulate transparency for transparent background. We can skip background + * shader evaluation unless a background pass is used. 
*/ + bool eval_background = true; + float transparent = 0.0f; + + const bool is_transparent_background_ray = kernel_data.background.transparent && + (INTEGRATOR_STATE(path, flag) & + PATH_RAY_TRANSPARENT_BACKGROUND); + + if (is_transparent_background_ray) { + transparent = average(INTEGRATOR_STATE(path, throughput)); + +#ifdef __PASSES__ + eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)); +#else + eval_background = false; +#endif + } + + /* Evaluate background shader. */ + float3 L = (eval_background) ? + integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) : + zero_float3(); + + /* When using the ao bounces approximation, adjust background + * shader intensity with ao factor. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + L *= kernel_data.integrator.ao_bounces_factor; + } + + /* Write to render buffer. */ + kernel_accum_background( + INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer); +} + +ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float ray_time = INTEGRATOR_STATE(ray, time); + LightSample ls ccl_optional_struct_init; + for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { + if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) { + /* Use visibility flag to skip lights. 
*/ +#ifdef __PASSES__ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (ls.shader & SHADER_EXCLUDE_ANY) { + if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((ls.shader & SHADER_EXCLUDE_GLOSSY) && + ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return; + } +#endif + + /* Evaluate light shader. */ + /* TODO: does aliasing like this break automatic SoA in CUDA? */ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time); + if (is_zero(light_eval)) { + return; + } + + /* MIS weighting. */ + if (!(path_flag & PATH_RAY_MIS_SKIP)) { + /* multiple importance sampling, get regular light pdf, + * and compute weight with respect to BSDF pdf */ + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + light_eval *= mis_weight; + } + + /* Write to render buffer. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer); + } + } +} + +ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP); + + /* TODO: unify these in a single loop to only have a single shader evaluation call. 
*/ + integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer); + integrate_background(INTEGRATOR_STATE_PASS, render_buffer); + +#ifdef __SHADOW_CATCHER__ + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) { + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND; + + const int isect_prim = INTEGRATOR_STATE(isect, prim); + const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim); + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + shader); + } + return; + } +#endif + + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h new file mode 100644 index 00000000000..05b530f9665 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_light.h @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Setup light sample. */ + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + float3 ray_P = INTEGRATOR_STATE(ray, P); + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float ray_time = INTEGRATOR_STATE(ray, time); + + /* Advance ray beyond light. */ + /* TODO: can we make this more numerically robust to avoid reintersecting the + * same light in some cases? */ + const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D); + INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P; + INTEGRATOR_STATE_WRITE(ray, t) -= isect.t; + + /* Set position to where the BSDF was sampled, for correct MIS PDF. */ + const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t); + ray_P -= ray_D * mis_ray_t; + isect.t += mis_ray_t; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = mis_ray_t + isect.t; + + LightSample ls ccl_optional_struct_init; + const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls); + + if (!use_light_sample) { + return; + } + + /* Use visibility flag to skip lights. */ +#ifdef __PASSES__ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (ls.shader & SHADER_EXCLUDE_ANY) { + if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((ls.shader & SHADER_EXCLUDE_GLOSSY) && + ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return; + } +#endif + + /* Evaluate light shader. 
*/ + /* TODO: does aliasing like this break automatic SoA in CUDA? */ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time); + if (is_zero(light_eval)) { + return; + } + + /* MIS weighting. */ + if (!(path_flag & PATH_RAY_MIS_SKIP)) { + /* multiple importance sampling, get regular light pdf, + * and compute weight with respect to BSDF pdf */ + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + light_eval *= mis_weight; + } + + /* Write to render buffer. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer); +} + +ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP); + + integrate_light(INTEGRATOR_STATE_PASS, render_buffer); + + /* TODO: we could get stuck in an infinite loop if there are precision issues + * and the same light is hit again. + * + * As a workaround count this as a transparent bounce. It makes some sense + * to interpret lights as transparent surfaces (and support making them opaque), + * but this needs to be revisited. 
*/ + uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce; + + if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + + /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but + * probably that optimization is probably not practical if we add lights to + * scene geometry. */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h new file mode 100644 index 00000000000..fd3c3ae1653 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h @@ -0,0 +1,182 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_shade_volume.h" +#include "kernel/integrator/integrator_volume_stack.h" + +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits) +{ + return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE; +} + +#ifdef __TRANSPARENT_SHADOWS__ +ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE); + + /* TODO: does aliasing like this break automatic SoA in CUDA? + * Should we instead store closures separate from ShaderData? + * + * TODO: is it better to declare this outside the loop or keep it local + * so the compiler can see there is no dependency between iterations? */ + ShaderDataTinyStorage shadow_sd_storage; + ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage); + + /* Setup shader data at surface. */ + Intersection isect ccl_optional_struct_init; + integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit); + + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + shader_setup_from_ray(kg, shadow_sd, &ray, &isect); + + /* Evaluate shader. */ + if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>( + INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW); + } + +# ifdef __VOLUME__ + /* Exit/enter volume. */ + shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd); +# endif + + /* Compute transparency from closures. */ + return shader_bsdf_transparency(kg, shadow_sd); +} + +# ifdef __VOLUME__ +ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS, + const int hit, + const int num_recorded_hits, + float3 *ccl_restrict throughput) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME); + + /* TODO: deduplicate with surface, or does it not matter for memory usage? 
*/ + ShaderDataTinyStorage shadow_sd_storage; + ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage); + + /* Setup shader data. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Modify ray position and length to match current segment. */ + const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t); + const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) : + ray.t; + ray.P += start_t * ray.D; + ray.t = end_t - start_t; + + shader_setup_from_volume(kg, shadow_sd, &ray); + + const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size); +} +# endif + +ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits) +{ + /* Accumulate shadow for transparent surfaces. */ + const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + + for (int hit = 0; hit < num_recorded_hits + 1; hit++) { + /* Volume shaders. */ + if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) { +# ifdef __VOLUME__ + if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + float3 throughput = INTEGRATOR_STATE(shadow_path, throughput); + integrate_transparent_volume_shadow( + INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput); + if (is_zero(throughput)) { + return true; + } + + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + } +# endif + } + + /* Surface shaders. 
*/ + if (hit < num_recorded_hits) { + const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit); + const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow; + if (is_zero(throughput)) { + return true; + } + + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1; + } + + /* Note we do not need to check max_transparent_bounce here, the number + * of intersections is already limited and made opaque in the + * INTERSECT_SHADOW kernel. */ + } + + if (shadow_intersections_has_remaining(num_hits)) { + /* There are more hits that we could not recorded due to memory usage, + * adjust ray to intersect again from the last hit. */ + const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t); + const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P); + const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D); + INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D); + INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t; + } + + return false; +} +#endif /* __TRANSPARENT_SHADOWS__ */ + +ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP); + const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits); + +#ifdef __TRANSPARENT_SHADOWS__ + /* Evaluate transparent shadows. */ + const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits); + if (opaque) { + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +#endif + + if (shadow_intersections_has_remaining(num_hits)) { + /* More intersections to find, continue shadow ray. 
*/ + INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return; + } + else { + kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer); + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h new file mode 100644 index 00000000000..73b7cad32be --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h @@ -0,0 +1,502 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + +#include "kernel/integrator/integrator_subsurface.h" +#include "kernel/integrator/integrator_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd) +{ + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + + shader_setup_from_ray(kg, sd, &ray, &isect); +} + +#ifdef __HOLDOUT__ +ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + ccl_global float *ccl_restrict render_buffer) +{ + /* Write holdout transparency to render buffer and stop if fully holdout. */ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && + (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { + const float3 holdout_weight = shader_holdout_apply(kg, sd); + if (kernel_data.background.transparent) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float transparent = average(holdout_weight * throughput); + kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer); + } + if (isequal_float3(holdout_weight, one_float3())) { + return false; + } + } + + return true; +} +#endif /* __HOLDOUT__ */ + +#ifdef __EMISSION__ +ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *sd, + ccl_global float *ccl_restrict + render_buffer) +{ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + /* Evaluate emissive closure. 
*/ + float3 L = shader_emissive_eval(sd); + +# ifdef __HAIR__ + if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && + (sd->type & PRIMITIVE_ALL_TRIANGLE)) +# else + if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) +# endif + { + const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t); + + /* Multiple importance sampling, get triangle light pdf, + * and compute weight with respect to BSDF pdf. */ + float pdf = triangle_light_pdf(kg, sd, t); + float mis_weight = power_heuristic(bsdf_pdf, pdf); + + L *= mis_weight; + } + + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer); +} +#endif /* __EMISSION__ */ + +#ifdef __EMISSION__ +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state) +{ + /* Test if there is a light or BSDF that needs direct light. */ + if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) { + return; + } + + /* Sample position on a light. */ + LightSample ls ccl_optional_struct_init; + { + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + if (!light_distribution_sample_from_position( + kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) { + return; + } + } + + kernel_assert(ls.pdf != 0.0f); + + /* Evaluate light shader. + * + * TODO: can we reuse sd memory? In theory we can move this after + * integrate_surface_bounce, evaluate the BSDF, and only then evaluate + * the light shader. This could also move to its own kernel, for + * non-constant light sources. 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + const float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time); + if (is_zero(light_eval)) { + return; + } + + /* Evaluate BSDF. */ + const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D); + + BsdfEval bsdf_eval ccl_optional_struct_init; + const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader); + bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf); + + if (ls.shader & SHADER_USE_MIS) { + const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf); + bsdf_eval_mul(&bsdf_eval, mis_weight); + } + + /* Path termination. */ + const float terminate = path_state_rng_light_termination(kg, rng_state); + if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) { + return; + } + + /* Create shadow ray. */ + Ray ray ccl_optional_struct_init; + light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray); + const bool is_light = light_sample_is_light(&ls); + + /* Copy volume stack and enter/exit volume. */ + integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS); + + if (is_transmission) { +# ifdef __VOLUME__ + shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd); +# endif + } + + /* Write shadow ray and associated state to global memory. */ + integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Copy state from main path to shadow path. */ + const uint16_t bounce = INTEGRATOR_STATE(path, bounce); + const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); + uint32_t shadow_flag = INTEGRATOR_STATE(path, flag); + shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0; + shadow_flag |= (is_transmission) ? 
PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS; + const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + const float3 diffuse_glossy_ratio = (bounce == 0) ? + bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) : + INTEGRATOR_STATE(path, diffuse_glossy_ratio); + INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio; + } + + INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag; + INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce; + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + + if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) { + INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput; + } + + /* Branch off shadow kernel. */ + INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); +} +#endif + +/* Path tracing: bounce off or through surface with new direction. */ +ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state) +{ + /* Sample BSDF or BSSRDF. */ + if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) { + return LABEL_NONE; + } + + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u); + +#ifdef __SUBSURFACE__ + /* BSSRDF closure, we schedule subsurface intersection kernel. */ + if (CLOSURE_IS_BSSRDF(sc->type)) { + return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc); + } +#endif + + /* BSDF closure, sample direction. 
*/ + float bsdf_pdf; + BsdfEval bsdf_eval ccl_optional_struct_init; + float3 bsdf_omega_in ccl_optional_struct_init; + differential3 bsdf_domega_in ccl_optional_struct_init; + int label; + + label = shader_bsdf_sample_closure( + kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) { + return LABEL_NONE; + } + + /* Setup ray. Note that clipping works through transparent bounces. */ + INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng); + INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in); + INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ? + INTEGRATOR_STATE(ray, t) - sd->ray_length : + FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in); +#endif + + /* Update throughput. */ + float3 throughput = INTEGRATOR_STATE(path, throughput); + throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf; + INTEGRATOR_STATE_WRITE(path, throughput) = throughput; + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + if (INTEGRATOR_STATE(path, bounce) == 0) { + INTEGRATOR_STATE_WRITE(path, + diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval); + } + } + + /* Update path state */ + if (label & LABEL_TRANSPARENT) { + INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length; + } + else { + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf, + INTEGRATOR_STATE(path, min_ray_pdf)); + } + + path_state_next(INTEGRATOR_STATE_PASS, label); + return label; +} + +#ifdef __VOLUME__ +ccl_device_forceinline bool integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS, + ShaderData *sd) +{ + if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) { + return LABEL_NONE; + } 
+ + /* Setup ray position, direction stays unchanged. */ + INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng); + + /* Clipping works through transparent. */ + INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length; + +# ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); +# endif + + INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length; + + return LABEL_TRANSMIT | LABEL_TRANSPARENT; +} +#endif + +#if defined(__AO__) && defined(__SHADER_RAYTRACE__) +ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + ccl_global float *ccl_restrict render_buffer) +{ +# ifdef __KERNEL_OPTIX__ + optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer); +} + +extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + ccl_global float *ccl_restrict render_buffer) +{ +# endif /* __KERNEL_OPTIX__ */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + const float3 ao_N = shader_bsdf_ao_normal(kg, sd); + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray ray ccl_optional_struct_init; + ray.P = ray_offset(sd->P, sd->Ng); + ray.D = ao_D; + ray.t = kernel_data.integrator.ao_bounces_distance; + ray.time = sd->time; + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); + + Intersection isect ccl_optional_struct_init; + if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) { + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput); + } + } +} +#endif /* 
defined(__AO__) && defined(__SHADER_RAYTRACE__) */ + +template<uint node_feature_mask> +ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) + +{ + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP); + + /* Setup shader data. */ + ShaderData sd; + integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd); + PROFILING_SHADER(sd.object, sd.shader); + + int continue_path_label = 0; + + /* Skip most work for volume bounding surface. */ +#ifdef __VOLUME__ + if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { +#endif + + { + const int path_flag = INTEGRATOR_STATE(path, flag); +#ifdef __SUBSURFACE__ + /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */ + if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP))) +#endif + { + /* Evaluate shader. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL); + shader_eval_surface<node_feature_mask>( + INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag); + } + } + +#ifdef __SUBSURFACE__ + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) { + /* When coming from inside subsurface scattering, setup a diffuse + * closure to perform lighting at the exit point. */ + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE; + subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd); + } +#endif + + shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd); + +#ifdef __HOLDOUT__ + /* Evaluate holdout. */ + if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) { + return false; + } +#endif + +#ifdef __EMISSION__ + /* Write emission. */ + if (sd.flag & SD_EMISSION) { + integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer); + } +#endif + +#ifdef __PASSES__ + /* Write render passes. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES); + kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + + /* Load random number state. 
*/ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* Perform path termination. Most paths have already been terminated in + * the intersect_closest kernel, this is just for emission and for dividing + * throughput by the probability at the right moment. */ + const int path_flag = INTEGRATOR_STATE(path, flag); + const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ? + 0.0f : + path_state_continuation_probability(INTEGRATOR_STATE_PASS, + path_flag); + if (probability == 0.0f) { + return false; + } + else if (probability != 1.0f) { + INTEGRATOR_STATE_WRITE(path, throughput) /= probability; + } + +#ifdef __DENOISING_FEATURES__ + kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + +#ifdef __SHADOW_CATCHER__ + kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + + /* Direct light. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT); + integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state); + +#if defined(__AO__) && defined(__SHADER_RAYTRACE__) + /* Ambient occlusion pass. */ + if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) { + if ((kernel_data.film.pass_ao != PASS_UNUSED) && + (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) { + PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO); + integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer); + } + } +#endif + + PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT); + continue_path_label = integrate_surface_bsdf_bssrdf_bounce( + INTEGRATOR_STATE_PASS, &sd, &rng_state); +#ifdef __VOLUME__ + } + else { + PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT); + continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd); + } + + if (continue_path_label & LABEL_TRANSMIT) { + /* Enter/Exit volume. 
*/ + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd); + } +#endif + + return continue_path_label != 0; +} + +template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE, + int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE> +ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) { + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) { + INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); + } + else { + kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f); + INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + } + else { + INTEGRATOR_PATH_TERMINATE(current_kernel); + } +} + +ccl_device_forceinline void integrator_shade_surface_raytrace( + INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer) +{ + integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS, + render_buffer); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h new file mode 100644 index 00000000000..4a864b1e6ce --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h @@ -0,0 +1,1015 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +#ifdef __VOLUME__ + +/* Events for probalistic scattering */ + +typedef enum VolumeIntegrateEvent { + VOLUME_PATH_SCATTERED = 0, + VOLUME_PATH_ATTENUATED = 1, + VOLUME_PATH_MISSED = 2 +} VolumeIntegrateEvent; + +typedef struct VolumeIntegrateResult { + /* Throughput and offset for direct light scattering. */ + bool direct_scatter; + float3 direct_throughput; + float direct_t; + ShaderVolumePhases direct_phases; + + /* Throughput and offset for indirect light scattering. */ + bool indirect_scatter; + float3 indirect_throughput; + float indirect_t; + ShaderVolumePhases indirect_phases; +} VolumeIntegrateResult; + +/* Ignore paths that have volume throughput below this value, to avoid unnecessary work + * and precision issues. + * todo: this value could be tweaked or turned into a probability to avoid unnecessary + * work in volumes and subsurface scattering. */ +# define VOLUME_THROUGHPUT_EPSILON 1e-6f + +/* Volume shader properties + * + * extinction coefficient = absorption coefficient + scattering coefficient + * sigma_t = sigma_a + sigma_s */ + +typedef struct VolumeShaderCoefficients { + float3 sigma_t; + float3 sigma_s; + float3 emission; +} VolumeShaderCoefficients; + +/* Evaluate shader to get extinction coefficient at P. 
*/ +ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict extinction) +{ + shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + if (!(sd->flag & SD_EXTINCTION)) { + return false; + } + + const float density = object_volume_density(kg, sd->object); + *extinction = sd->closure_transparent_extinction * density; + return true; +} + +/* Evaluate shader to get absorption, scattering and emission at P. */ +ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict sd, + VolumeShaderCoefficients *coeff) +{ + const int path_flag = INTEGRATOR_STATE(path, flag); + shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) { + return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) { + return false; + } + + coeff->sigma_s = zero_float3(); + coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3(); + coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3(); + + if (sd->flag & SD_SCATTER) { + for (int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if (CLOSURE_IS_VOLUME(sc->type)) { + coeff->sigma_s += sc->weight; + } + } + } + + const float density = object_volume_density(kg, sd->object); + coeff->sigma_s *= density; + coeff->sigma_t *= density; + coeff->emission *= density; + + return true; +} + +ccl_device_forceinline void volume_step_init(const KernelGlobals *kg, + const RNGState *rng_state, + const float object_step_size, + float t, + float *step_size, + float *step_shade_offset, + float *steps_offset, + int *max_steps) +{ + if (object_step_size == FLT_MAX) { + /* Homogeneous volume. 
*/ + *step_size = t; + *step_shade_offset = 0.0f; + *steps_offset = 1.0f; + *max_steps = 1; + } + else { + /* Heterogeneous volume. */ + *max_steps = kernel_data.integrator.volume_max_steps; + float step = min(object_step_size, t); + + /* compute exact steps in advance for malloc */ + if (t > *max_steps * step) { + step = t / (float)*max_steps; + } + + *step_size = step; + + /* Perform shading at this offset within a step, to integrate over + * over the entire step segment. */ + *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4); + + /* Shift starting point of all segment by this random amount to avoid + * banding artifacts from the volume bounding shape. */ + *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3); + } +} + +/* Volume Shadows + * + * These functions are used to attenuate shadow rays to lights. Both absorption + * and scattering will block light, represented by the extinction coefficient. */ + +# if 0 +/* homogeneous volume: assume shader evaluation at the starts gives + * the extinction coefficient for the entire line segment */ +ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict throughput) +{ + float3 sigma_t = zero_float3(); + + if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) { + *throughput *= volume_color_transmittance(sigma_t, ray->t); + } +} +# endif + +/* heterogeneous volume: integrate stepping through the volume until we + * reach the end, get absorbed entirely, or run out of iterations */ +ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict throughput, + const float object_step_size) +{ + /* Load random number state. */ + RNGState rng_state; + shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + float3 tp = *throughput; + + /* Prepare for stepping. 
+ * For shadows we do not offset all segments, since the starting point is + * already a random distance inside the volume. It also appears to create + * banding artifacts for unknown reasons. */ + int max_steps; + float step_size, step_shade_offset, unused; + volume_step_init(kg, + &rng_state, + object_step_size, + ray->t, + &step_size, + &step_shade_offset, + &unused, + &max_steps); + const float steps_offset = 1.0f; + + /* compute extinction at the start */ + float t = 0.0f; + + float3 sum = zero_float3(); + + for (int i = 0; i < max_steps; i++) { + /* advance to new position */ + float new_t = min(ray->t, (i + steps_offset) * step_size); + float dt = new_t - t; + + float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); + float3 sigma_t = zero_float3(); + + /* compute attenuation over segment */ + sd->P = new_P; + if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) { + /* Compute expf() only for every Nth step, to save some calculations + * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON + * check then. */ + sum += (-sigma_t * dt); + if ((i & 0x07) == 0) { /* ToDo: Other interval? 
*/ + tp = *throughput * exp3(sum); + + /* stop if nearly all light is blocked */ + if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && + tp.z < VOLUME_THROUGHPUT_EPSILON) + break; + } + } + + /* stop if at the end of the volume */ + t = new_t; + if (t == ray->t) { + /* Update throughput in case we haven't done it above */ + tp = *throughput * exp3(sum); + break; + } + } + + *throughput = tp; +} + +/* Equi-angular sampling as in: + * "Importance Sampling Techniques for Path Tracing in Participating Media" */ + +ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray, + const float3 light_P, + const float xi, + float *pdf) +{ + const float t = ray->t; + const float delta = dot((light_P - ray->P), ray->D); + const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + *pdf = 0.0f; + return 0.0f; + } + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); + if (UNLIKELY(theta_b == theta_a)) { + *pdf = 0.0f; + return 0.0f; + } + *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); + + return min(t, delta + t_); /* min is only for float precision errors */ +} + +ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray, + const float3 light_P, + const float sample_t) +{ + const float delta = dot((light_P - ray->P), ray->D); + const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + return 0.0f; + } + + const float t = ray->t; + const float t_ = sample_t - delta; + + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + if (UNLIKELY(theta_b == theta_a)) { + return 0.0f; + } + + const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); + + return pdf; +} + +ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray, + const float3 light_P, + const float sample_t) +{ + float delta = 
dot((light_P - ray->P), ray->D); + float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + return 0.0f; + } + + const float t = ray->t; + const float t_ = sample_t - delta; + + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + if (UNLIKELY(theta_b == theta_a)) { + return 0.0f; + } + + const float theta_sample = atan2f(t_, D); + const float cdf = (theta_sample - theta_a) / (theta_b - theta_a); + + return cdf; +} + +/* Distance sampling */ + +ccl_device float volume_distance_sample( + float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) +{ + /* xi is [0, 1[ so log(0) should never happen, division by zero is + * avoided because sample_sigma_t > 0 when SD_SCATTER is set */ + float sample_sigma_t = volume_channel_get(sigma_t, channel); + float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); + float sample_transmittance = volume_channel_get(full_transmittance, channel); + + float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t); + + *transmittance = volume_color_transmittance(sigma_t, sample_t); + *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance); + + /* todo: optimization: when taken together with hit/miss decision, + * the full_transmittance cancels out drops out and xi does not + * need to be remapped */ + + return sample_t; +} + +ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t) +{ + float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); + float3 transmittance = volume_color_transmittance(sigma_t, sample_t); + + return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance); +} + +/* Emission */ + +ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff, + int closure_flag, + float3 transmittance, + float t) +{ + /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - 
exp(-sigma_t * t))/sigma_t + * this goes to E * t as sigma_t goes to zero + * + * todo: we should use an epsilon to avoid precision issues near zero sigma_t */ + float3 emission = coeff->emission; + + if (closure_flag & SD_EXTINCTION) { + float3 sigma_t = coeff->sigma_t; + + emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t; + emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t; + emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t; + } + else + emission *= t; + + return emission; +} + +/* Volume Integration */ + +typedef struct VolumeIntegrateState { + /* Volume segment extents. */ + float start_t; + float end_t; + + /* If volume is absorption-only up to this point, and no probabilistic + * scattering or termination has been used yet. */ + bool absorption_only; + + /* Random numbers for scattering. */ + float rscatter; + float rphase; + + /* Multiple importance sampling. */ + VolumeSampleMethod direct_sample_method; + bool use_mis; + float distance_pdf; + float equiangular_pdf; +} VolumeIntegrateState; + +ccl_device_forceinline void volume_integrate_step_scattering( + const ShaderData *sd, + const Ray *ray, + const float3 equiangular_light_P, + const VolumeShaderCoefficients &ccl_restrict coeff, + const float3 transmittance, + VolumeIntegrateState &ccl_restrict vstate, + VolumeIntegrateResult &ccl_restrict result) +{ + /* Pick random color channel, we use the Veach one-sample + * model with balance heuristic for the channels. */ + const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + float3 channel_pdf; + const int channel = volume_sample_channel( + albedo, result.indirect_throughput, vstate.rphase, &channel_pdf); + + /* Equiangular sampling for direct lighting. 
*/ + if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) { + if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) { + const float new_dt = result.direct_t - vstate.start_t; + const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); + + result.direct_scatter = true; + result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf; + shader_copy_volume_phases(&result.direct_phases, sd); + + /* Multiple importance sampling. */ + if (vstate.use_mis) { + const float distance_pdf = vstate.distance_pdf * + dot(channel_pdf, coeff.sigma_t * new_transmittance); + const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf); + result.direct_throughput *= mis_weight; + } + } + else { + result.direct_throughput *= transmittance; + vstate.distance_pdf *= dot(channel_pdf, transmittance); + } + } + + /* Distance sampling for indirect and optional direct lighting. */ + if (!result.indirect_scatter) { + /* decide if we will scatter or continue */ + const float sample_transmittance = volume_channel_get(transmittance, channel); + + if (1.0f - vstate.rscatter >= sample_transmittance) { + /* compute sampling distance */ + const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel); + const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t; + const float new_t = vstate.start_t + new_dt; + + /* transmittance and pdf */ + const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); + const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance); + + /* throughput */ + result.indirect_scatter = true; + result.indirect_t = new_t; + result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf; + shader_copy_volume_phases(&result.indirect_phases, sd); + + if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) { + /* If using distance sampling for direct light, just copy 
parameters + * of indirect light since we scatter at the same point then. */ + result.direct_scatter = true; + result.direct_t = result.indirect_t; + result.direct_throughput = result.indirect_throughput; + shader_copy_volume_phases(&result.direct_phases, sd); + + /* Multiple importance sampling. */ + if (vstate.use_mis) { + const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t); + const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf, + equiangular_pdf); + result.direct_throughput *= 2.0f * mis_weight; + } + } + } + else { + /* throughput */ + const float pdf = dot(channel_pdf, transmittance); + result.indirect_throughput *= transmittance / pdf; + if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) { + vstate.distance_pdf *= pdf; + } + + /* remap rscatter so we can reuse it and keep thing stratified */ + vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance; + } + } +} + +/* heterogeneous volume distance sampling: integrate stepping through the + * volume until we reach the end, get absorbed entirely, or run out of + * iterations. this does probabilistically scatter or get transmitted through + * for path tracing where we don't want to branch. */ +ccl_device_forceinline void volume_integrate_heterogeneous( + INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + const RNGState *rng_state, + ccl_global float *ccl_restrict render_buffer, + const float object_step_size, + const VolumeSampleMethod direct_sample_method, + const float3 equiangular_light_P, + VolumeIntegrateResult &result) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE); + + /* Prepare for stepping. + * Using a different step offset for the first step avoids banding artifacts. 
*/ + int max_steps; + float step_size, step_shade_offset, steps_offset; + volume_step_init(kg, + rng_state, + object_step_size, + ray->t, + &step_size, + &step_shade_offset, + &steps_offset, + &max_steps); + + /* Initialize volume integration state. */ + VolumeIntegrateState vstate ccl_optional_struct_init; + vstate.start_t = 0.0f; + vstate.end_t = 0.0f; + vstate.absorption_only = true; + vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE); + vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL); + + /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */ + vstate.direct_sample_method = direct_sample_method; + vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS); + if (vstate.use_mis) { + if (vstate.rscatter < 0.5f) { + vstate.rscatter *= 2.0f; + vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE; + } + else { + vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f; + vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR; + } + } + vstate.equiangular_pdf = 0.0f; + vstate.distance_pdf = 1.0f; + + /* Initialize volume integration result. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + result.direct_throughput = throughput; + result.indirect_throughput = throughput; + + /* Equiangular sampling: compute distance and PDF in advance. 
*/ + if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) { + result.direct_t = volume_equiangular_sample( + ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf); + } + +# ifdef __DENOISING_FEATURES__ + const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) & + PATH_RAY_DENOISING_FEATURES); + float3 accum_albedo = zero_float3(); +# endif + float3 accum_emission = zero_float3(); + + for (int i = 0; i < max_steps; i++) { + /* Advance to new position */ + vstate.end_t = min(ray->t, (i + steps_offset) * step_size); + const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset; + sd->P = ray->P + ray->D * shade_t; + + /* compute segment */ + VolumeShaderCoefficients coeff ccl_optional_struct_init; + if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) { + const int closure_flag = sd->flag; + + /* Evaluate transmittance over segment. */ + const float dt = (vstate.end_t - vstate.start_t); + const float3 transmittance = (closure_flag & SD_EXTINCTION) ? + volume_color_transmittance(coeff.sigma_t, dt) : + one_float3(); + + /* Emission. */ + if (closure_flag & SD_EMISSION) { + /* Only write emission before indirect light scatter position, since we terminate + * stepping at that point if we have already found a direct light scatter position. */ + if (!result.indirect_scatter) { + const float3 emission = volume_emission_integrate( + &coeff, closure_flag, transmittance, dt); + accum_emission += emission; + } + } + + if (closure_flag & SD_EXTINCTION) { + if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) { +# ifdef __DENOISING_FEATURES__ + /* Accumulate albedo for denoising features. */ + if (write_denoising_features && (closure_flag & SD_SCATTER)) { + const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance); + } +# endif + + /* Scattering and absorption. 
*/ + volume_integrate_step_scattering( + sd, ray, equiangular_light_P, coeff, transmittance, vstate, result); + } + else { + /* Absorption only. */ + result.indirect_throughput *= transmittance; + result.direct_throughput *= transmittance; + } + + /* Stop if nearly all light blocked. */ + if (!result.indirect_scatter) { + if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) { + result.indirect_throughput = zero_float3(); + break; + } + } + else if (!result.direct_scatter) { + if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) { + break; + } + } + } + + /* If we have scattering data for both direct and indirect, we're done. */ + if (result.direct_scatter && result.indirect_scatter) { + break; + } + } + + /* Stop if at the end of the volume. */ + vstate.start_t = vstate.end_t; + if (vstate.start_t == ray->t) { + break; + } + } + + /* Write accumulated emisison. */ + if (!is_zero(accum_emission)) { + kernel_accum_emission( + INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer); + } + +# ifdef __DENOISING_FEATURES__ + /* Write denoising features. */ + if (write_denoising_features) { + kernel_write_denoising_features_volume( + INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer); + } +# endif /* __DENOISING_FEATURES__ */ +} + +# ifdef __EMISSION__ +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + LightSample *ccl_restrict ls) +{ + /* Test if there is a light or BSDF that needs direct light. */ + if (!kernel_data.integrator.use_direct_light) { + return false; + } + + /* Sample position on a light. 
*/ + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + light_distribution_sample_from_volume_segment( + kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls); + + if (ls->shader & SHADER_EXCLUDE_SCATTER) { + return false; + } + + return true; +} + +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + const float3 P, + const ShaderVolumePhases *ccl_restrict + phases, + const float3 throughput, + LightSample *ccl_restrict ls) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT); + + if (!kernel_data.integrator.use_direct_light) { + return; + } + + /* Sample position on the same light again, now from the shading + * point where we scattered. + * + * TODO: decorrelate random numbers and use light_sample_new_position to + * avoid resampling the CDF. */ + { + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + if (!light_distribution_sample_from_position( + kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) { + return; + } + } + + /* Evaluate light shader. + * + * TODO: can we reuse sd memory? In theory we can move this after + * integrate_surface_bounce, evaluate the BSDF, and only then evaluate + * the light shader. This could also move to its own kernel, for + * non-constant light sources. 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + const float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time); + if (is_zero(light_eval)) { + return; + } + + /* Evaluate BSDF. */ + BsdfEval phase_eval ccl_optional_struct_init; + const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval); + + if (ls->shader & SHADER_USE_MIS) { + float mis_weight = power_heuristic(ls->pdf, phase_pdf); + bsdf_eval_mul(&phase_eval, mis_weight); + } + + bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf); + + /* Path termination. */ + const float terminate = path_state_rng_light_termination(kg, rng_state); + if (light_sample_terminate(kg, ls, &phase_eval, terminate)) { + return; + } + + /* Create shadow ray. */ + Ray ray ccl_optional_struct_init; + light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray); + const bool is_light = light_sample_is_light(ls); + + /* Write shadow ray and associated state to global memory. */ + integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Copy state from main path to shadow path. */ + const uint16_t bounce = INTEGRATOR_STATE(path, bounce); + const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); + uint32_t shadow_flag = INTEGRATOR_STATE(path, flag); + shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0; + shadow_flag |= PATH_RAY_VOLUME_PASS; + const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + const float3 diffuse_glossy_ratio = (bounce == 0) ? 
+ one_float3() : + INTEGRATOR_STATE(path, diffuse_glossy_ratio); + INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio; + } + + INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag; + INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce; + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase; + + if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) { + INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput; + } + + integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS); + + /* Branch off shadow kernel. */ + INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); +} +# endif + +/* Path tracing: scatter in new direction using phase function */ +ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state, + const ShaderVolumePhases *phases) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT); + + float phase_u, phase_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v); + + /* Phase closure, sample direction. */ + float phase_pdf; + BsdfEval phase_eval ccl_optional_struct_init; + float3 phase_omega_in ccl_optional_struct_init; + differential3 phase_domega_in ccl_optional_struct_init; + + const int label = shader_volume_phase_sample(kg, + sd, + phases, + phase_u, + phase_v, + &phase_eval, + &phase_omega_in, + &phase_domega_in, + &phase_pdf); + + if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) { + return false; + } + + /* Setup ray. 
*/ + INTEGRATOR_STATE_WRITE(ray, P) = sd->P; + INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in); + INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX; + +# ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in); +# endif + + /* Update throughput. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf; + INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase; + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3(); + } + + /* Update path state */ + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf, + INTEGRATOR_STATE(path, min_ray_pdf)); + + path_state_next(INTEGRATOR_STATE_PASS, label); + return true; +} + +/* get the volume attenuation and emission over line segment defined by + * ray, with the assumption that there are no surfaces blocking light + * between the endpoints. distance sampling is used to decide if we will + * scatter or not. */ +ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ccl_global float *ccl_restrict render_buffer) +{ + ShaderData sd; + shader_setup_from_volume(kg, &sd, ray); + + /* Load random number state. */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* Sample light ahead of volume stepping, for equiangular sampling. */ + /* TODO: distant lights are ignored now, but could instead use even distribution. 
*/ + LightSample ls ccl_optional_struct_init; + const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE); + const bool have_equiangular_sample = need_light_sample && + integrate_volume_sample_light( + INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) && + (ls.t != FLT_MAX); + + VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ? + volume_stack_sample_method(INTEGRATOR_STATE_PASS) : + VOLUME_SAMPLE_DISTANCE; + + /* Step through volume. */ + const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) { + return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + /* TODO: expensive to zero closures? */ + VolumeIntegrateResult result = {}; + volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS, + ray, + &sd, + &rng_state, + render_buffer, + step_size, + direct_sample_method, + ls.P, + result); + + /* Perform path termination. The intersect_closest will have already marked this path + * to be terminated. That will shading evaluating to leave out any scattering closures, + * but emission and absorption are still handled for multiple importance sampling. */ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ? + 0.0f : + path_state_continuation_probability(INTEGRATOR_STATE_PASS, + path_flag); + if (probability == 0.0f) { + return VOLUME_PATH_MISSED; + } + + /* Direct light. */ + if (result.direct_scatter) { + const float3 direct_P = ray->P + result.direct_t * ray->D; + result.direct_throughput /= probability; + integrate_volume_direct_light(INTEGRATOR_STATE_PASS, + &sd, + &rng_state, + direct_P, + &result.direct_phases, + result.direct_throughput, + &ls); + } + + /* Indirect light. + * + * Only divide throughput by probability if we scatter. For the attenuation + * case the next surface will already do this division. 
*/ + if (result.indirect_scatter) { + result.indirect_throughput /= probability; + } + INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput; + + if (result.indirect_scatter) { + sd.P = ray->P + result.indirect_t * ray->D; + + if (integrate_volume_phase_scatter( + INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) { + return VOLUME_PATH_SCATTERED; + } + else { + return VOLUME_PATH_MISSED; + } + } + else { + return VOLUME_PATH_ATTENUATED; + } +} + +#endif + +ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP); + +#ifdef __VOLUME__ + /* Setup shader data. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + /* Set ray length to current segment. */ + ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX; + + /* Clean volume stack for background rays. */ + if (isect.prim == PRIM_NONE) { + volume_stack_clean(INTEGRATOR_STATE_PASS); + } + + VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer); + + if (event == VOLUME_PATH_SCATTERED) { + /* Queue intersect_closest kernel. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + else if (event == VOLUME_PATH_MISSED) { + /* End path. */ + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + return; + } + else { + /* Continue to background, light or surface. 
*/ + if (isect.prim == PRIM_NONE) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + return; + } + else if (isect.type & PRIMITIVE_LAMP) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + /* Hit a surface, continue with surface kernel unless terminated. */ + const int shader = intersection_get_shader(kg, &isect); + const int flags = kernel_tex_fetch(__shaders, shader).flags; + + integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>( + INTEGRATOR_STATE_PASS, &isect, shader, flags); + return; + } + } +#endif /* __VOLUME__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h new file mode 100644 index 00000000000..8cef9cf31e2 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state.h @@ -0,0 +1,185 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Integrator State + * + * This file defines the data structures that define the state of a path. Any state that is + * preserved and passed between kernel executions is part of this. + * + * The size of this state must be kept as small as possible, to reduce cache misses and keep memory + * usage under control on GPUs that may execute millions of kernels. 
+ * + * Memory may be allocated and passed along in different ways depending on the device. There may + * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer + * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract + * these differences between devices and experiment with different layouts, macros are used. + * + * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses + * path state. + * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only + * INTEGRATOR_STATE_PASS: use to pass along state to other functions access it. + * + * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState + * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState + * + * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y + * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y + * + * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another + * + * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation + * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions. + * + * NOTE: if we end up with a device that passes no arguments, the leading comma will be a problem. + * Can solve it with more macros if we encouter it, but rather ugly so postpone for now. + */ + +#include "kernel/kernel_types.h" + +#include "util/util_types.h" + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Constants + * + * TODO: these could be made dynamic depending on the features used in the scene. */ + +#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE +#define INTEGRATOR_SHADOW_ISECT_SIZE 4 + +/* Data structures */ + +/* Integrator State + * + * CPU rendering path state with AoS layout. 
*/ +typedef struct IntegratorStateCPU { +#define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name; +#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER +#define KERNEL_STRUCT_END(name) \ + } \ + name; +#define KERNEL_STRUCT_END_ARRAY(name, size) \ + } \ + name[size]; +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY +} IntegratorStateCPU; + +/* Path Queue + * + * Keep track of which kernels are queued to be executed next in the path + * for GPU rendering. */ +typedef struct IntegratorQueueCounter { + int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM]; +} IntegratorQueueCounter; + +/* Integrator State GPU + * + * GPU rendering path state with SoA layout. */ +typedef struct IntegratorStateGPU { +#define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name; +#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER +#define KERNEL_STRUCT_END(name) \ + } \ + name; +#define KERNEL_STRUCT_END_ARRAY(name, size) \ + } \ + name[size]; +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY + + /* Count number of queued kernels. */ + IntegratorQueueCounter *queue_counter; + + /* Count number of kernels queued for specific shaders. */ + int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM]; + + /* Index of path which will be used by a next shadow catcher split. */ + int *next_shadow_catcher_path_index; +} IntegratorStateGPU; + +/* Abstraction + * + * Macros to access data structures on different devices. + * + * Note that there is a special access function for the shadow catcher state. 
This access is to + * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors + * from a kernel which operates on a shadow catcher state will cause bad memory acces. */ + +#ifdef __KERNEL_CPU__ + +/* Scalar access on CPU. */ + +typedef IntegratorStateCPU *ccl_restrict IntegratorState; + +# define INTEGRATOR_STATE_ARGS \ + ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *ccl_restrict state +# define INTEGRATOR_STATE_CONST_ARGS \ + ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \ + const IntegratorStateCPU *ccl_restrict state +# define INTEGRATOR_STATE_PASS kg, state + +# define INTEGRATOR_STATE_PASS_NULL kg, NULL +# define INTEGRATOR_STATE_IS_NULL (state == NULL) + +# define INTEGRATOR_STATE(nested_struct, member) \ + (((const IntegratorStateCPU *)state)->nested_struct.member) +# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member) + +# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \ + (((const IntegratorStateCPU *)state)->nested_struct[array_index].member) +# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \ + ((state)->nested_struct[array_index].member) + +#else /* __KERNEL_CPU__ */ + +/* Array access on GPU with Structure-of-Arrays. 
*/ + +typedef int IntegratorState; + +# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state +# define INTEGRATOR_STATE_CONST_ARGS \ + const KernelGlobals *ccl_restrict kg, const IntegratorState state +# define INTEGRATOR_STATE_PASS kg, state + +# define INTEGRATOR_STATE_PASS_NULL kg, -1 +# define INTEGRATOR_STATE_IS_NULL (state == -1) + +# define INTEGRATOR_STATE(nested_struct, member) \ + kernel_integrator_state.nested_struct.member[state] +# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member) + +# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \ + kernel_integrator_state.nested_struct[array_index].member[state] +# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \ + INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) + +#endif /* __KERNEL_CPU__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h new file mode 100644 index 00000000000..8477efd7b66 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_flow.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_types.h" +#include "util/util_atomic.h" + +CCL_NAMESPACE_BEGIN + +/* Control Flow + * + * Utilities for control flow between kernels. 
The implementation may differ per device + * or even be handled on the host side. To abstract such differences, experiment with + * different implementations and for debugging, this is abstracted using macros. + * + * There is a main path for regular path tracing camera for path tracing. Shadows for next + * event estimation branch off from this into their own path, that may be computed in + * parallel while the main path continues. + * + * Each kernel on the main path must call one of these functions. These may not be called + * multiple times from the same kernel. + * + * INTEGRATOR_PATH_INIT(next_kernel) + * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) + * INTEGRATOR_PATH_TERMINATE(current_kernel) + * + * For the shadow path similar functions are used, and again each shadow kernel must call + * one of them, and only once. + */ + +#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0) +#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0) + +#ifdef __KERNEL_GPU__ + +# define INTEGRATOR_PATH_INIT(next_kernel) \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + +# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + 
INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + +# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ + { \ + const int key_ = key; \ + atomic_fetch_and_add_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ + 1); \ + } +# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ + { \ + const int key_ = key; \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ + 1); \ + } + +#else + +# define INTEGRATOR_PATH_INIT(next_kernel) \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + (void)key; \ + } +# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 
next_kernel; \ + (void)current_kernel; \ + } +# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \ + (void)current_kernel; \ + } +# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + (void)key; \ + (void)current_kernel; \ + } + +# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \ + (void)current_kernel; \ + } +# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \ + (void)current_kernel; \ + } + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h new file mode 100644 index 00000000000..41dd1bfcdbf --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_template.h @@ -0,0 +1,163 @@ + +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/************************************ Path State *****************************/ + +KERNEL_STRUCT_BEGIN(path) +/* Index of a pixel within the device render buffer where this path will write its result. 
+ * To get an actual offset within the buffer the value needs to be multiplied by the + * `kernel_data.film.pass_stride`. + * + * The multiplication is delayed for later, so that state can use 32bit integer. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING) +/* Current sample number. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING) +/* Current ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current diffuse ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current glossy ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transmission ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current volume ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current volume bounds ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transparent ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING) +/* DeviceKernel bit indicating queued kernels. + * TODO: reduce size? */ +KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING) +/* Random number generator seed. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING) +/* Random number dimension offset. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING) +/* enum PathRayFlag */ +KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) +/* Multiple importance sampling + * The PDF of BSDF sampling at the last scatter point, and distance to the + * last scatter point minus the last ray segment. 
This distance lets us + * compute the complete distance through transparent surfaces and volumes. */ +KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING) +/* Filter glossy. */ +KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING) +/* Throughput. */ +KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +/* Ratio of throughput to distinguish diffuse and glossy render passes. */ +KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES) +/* Denoising. */ +KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) +/* Shader sorting. */ +/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */ +KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(path) + +/************************************** Ray ***********************************/ + +KERNEL_STRUCT_BEGIN(ray) +KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(ray) + +/*************************** Intersection result ******************************/ + +/* Result from scene intersection. 
*/ +KERNEL_STRUCT_BEGIN(isect) +KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING) +/* TODO: exclude for GPU. */ +KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(isect) + +/*************** Subsurface closure state for subsurface kernel ***************/ + +KERNEL_STRUCT_BEGIN(subsurface) +KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_END(subsurface) + +/********************************** Volume Stack ******************************/ + +KERNEL_STRUCT_BEGIN(volume_stack) +KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) + +/********************************* Shadow Path State **************************/ + +KERNEL_STRUCT_BEGIN(shadow_path) +/* Current ray bounce depth. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transparent ray bounce depth. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING) +/* DeviceKernel bit indicating queued kernels. + * TODO: reduce size? 
*/ +KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING) +/* enum PathRayFlag */ +KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) +/* Throughput. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +/* Throughput for shadow pass. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS) +/* Ratio of throughput to distinguish diffuse and glossy render passes. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES) +/* Number of intersections found by ray-tracing. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(shadow_path) + +/********************************** Shadow Ray *******************************/ + +KERNEL_STRUCT_BEGIN(shadow_ray) +KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(shadow_ray) + +/*********************** Shadow Intersection result **************************/ + +/* Result from scene intersection. */ +KERNEL_STRUCT_BEGIN(shadow_isect) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING) +/* TODO: exclude for GPU. 
*/ +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE) + +/**************************** Shadow Volume Stack *****************************/ + +KERNEL_STRUCT_BEGIN(shadow_volume_stack) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h new file mode 100644 index 00000000000..cdf412fe22f --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -0,0 +1,273 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" +#include "kernel/kernel_differential.h" + +CCL_NAMESPACE_BEGIN + +/* Ray */ + +ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS, + const Ray *ccl_restrict ray) +{ + INTEGRATOR_STATE_WRITE(ray, P) = ray->P; + INTEGRATOR_STATE_WRITE(ray, D) = ray->D; + INTEGRATOR_STATE_WRITE(ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(ray, time) = ray->time; + INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP; + INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD; +} + +ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS, + Ray *ccl_restrict ray) +{ + ray->P = INTEGRATOR_STATE(ray, P); + ray->D = INTEGRATOR_STATE(ray, D); + ray->t = INTEGRATOR_STATE(ray, t); + ray->time = INTEGRATOR_STATE(ray, time); + ray->dP = INTEGRATOR_STATE(ray, dP); + ray->dD = INTEGRATOR_STATE(ray, dD); +} + +/* Shadow Ray */ + +ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS, + const Ray *ccl_restrict ray) +{ + INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P; + INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D; + INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time; + INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP; +} + +ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS, + Ray *ccl_restrict ray) +{ + ray->P = INTEGRATOR_STATE(shadow_ray, P); + ray->D = INTEGRATOR_STATE(shadow_ray, D); + ray->t = INTEGRATOR_STATE(shadow_ray, t); + ray->time = INTEGRATOR_STATE(shadow_ray, time); + ray->dP = INTEGRATOR_STATE(shadow_ray, dP); + ray->dD = differential_zero_compact(); +} + +/* Intersection */ + +ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS, + const Intersection *ccl_restrict isect) +{ + INTEGRATOR_STATE_WRITE(isect, t) = isect->t; + INTEGRATOR_STATE_WRITE(isect, u) = isect->u; + INTEGRATOR_STATE_WRITE(isect, v) = isect->v; + 
INTEGRATOR_STATE_WRITE(isect, object) = isect->object; + INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim; + INTEGRATOR_STATE_WRITE(isect, type) = isect->type; +#ifdef __EMBREE__ + INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng; +#endif +} + +ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS, + Intersection *ccl_restrict isect) +{ + isect->prim = INTEGRATOR_STATE(isect, prim); + isect->object = INTEGRATOR_STATE(isect, object); + isect->type = INTEGRATOR_STATE(isect, type); + isect->u = INTEGRATOR_STATE(isect, u); + isect->v = INTEGRATOR_STATE(isect, v); + isect->t = INTEGRATOR_STATE(isect, t); +#ifdef __EMBREE__ + isect->Ng = INTEGRATOR_STATE(isect, Ng); +#endif +} + +ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS, + int i) +{ + VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object), + INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)}; + return entry; +} + +ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS, + int i, + VolumeStack entry) +{ + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader; +} + +ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS) +{ + return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ? 
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE : + true; +} + +/* Shadow Intersection */ + +ccl_device_forceinline void integrator_state_write_shadow_isect( + INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index) +{ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type; +#ifdef __EMBREE__ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng; +#endif +} + +ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS, + Intersection *ccl_restrict isect, + const int index) +{ + isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim); + isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object); + isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type); + isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u); + isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v); + isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t); +#ifdef __EMBREE__ + isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng); +#endif +} + +ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS) +{ + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) { + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY( + volume_stack, i, object); + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY( + volume_stack, i, shader); + } + } +} + +ccl_device_forceinline VolumeStack +integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int 
i) +{ + VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object), + INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)}; + return entry; +} + +ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty( + INTEGRATOR_STATE_CONST_ARGS) +{ + return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ? + INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE : + true; +} + +ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS, + int i, + VolumeStack entry) +{ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader; +} + +#if defined(__KERNEL_GPU__) +ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state, + const IntegratorState state) +{ + int index; + + /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */ + +# define KERNEL_STRUCT_BEGIN(name) \ + index = 0; \ + do { + +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct.name != nullptr) { \ + kernel_integrator_state.parent_struct.name[to_state] = \ + kernel_integrator_state.parent_struct.name[state]; \ + } + +# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct[index].name != nullptr) { \ + kernel_integrator_state.parent_struct[index].name[to_state] = \ + kernel_integrator_state.parent_struct[index].name[state]; \ + } + +# define KERNEL_STRUCT_END(name) \ + } \ + while (false) \ + ; + +# define KERNEL_STRUCT_END_ARRAY(name, array_size) \ + ++index; \ + } \ + while (index < array_size) \ + ; + +# include "kernel/integrator/integrator_state_template.h" + +# undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_ARRAY_MEMBER +# undef KERNEL_STRUCT_END +# undef KERNEL_STRUCT_END_ARRAY +} + +ccl_device_inline void 
integrator_state_move(const IntegratorState to_state, + const IntegratorState state) +{ + integrator_state_copy_only(to_state, state); + + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; +} + +#endif + +/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths + * after this function. */ +ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS) +{ +#if defined(__KERNEL_GPU__) + const IntegratorState to_state = atomic_fetch_and_add_uint32( + &kernel_integrator_state.next_shadow_catcher_path_index[0], 1); + + integrator_state_copy_only(to_state, state); + + kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS; + + /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray + * and no sorting yet. */ + kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] == + nullptr); +#else + + IntegratorStateCPU *ccl_restrict split_state = state + 1; + + *split_state = *state; + + split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h new file mode 100644 index 00000000000..9490738404e --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_subsurface.h @@ -0,0 +1,623 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shader.h" + +#include "kernel/bvh/bvh.h" + +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bssrdf.h" +#include "kernel/closure/volume.h" + +#include "kernel/integrator/integrator_intersect_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +#ifdef __SUBSURFACE__ + +ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc) +{ + /* We should never have two consecutive BSSRDF bounces, the second one should + * be converted to a diffuse BSDF to avoid this. */ + kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR)); + + /* Setup path state for intersect_subsurface kernel. */ + const Bssrdf *bssrdf = (const Bssrdf *)sc; + + /* Setup ray into surface. */ + INTEGRATOR_STATE_WRITE(ray, P) = sd->P; + INTEGRATOR_STATE_WRITE(ray, D) = sd->N; + INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX; + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact(); + + /* Pass along object info, reusing isect to save memory. */ + INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng; + INTEGRATOR_STATE_WRITE(isect, object) = sd->object; + + /* Pass BSSRDF parameters. 
*/ + const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag); + INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE; + INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + if (INTEGRATOR_STATE(path, bounce) == 0) { + INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3(); + } + } + + INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo; + INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius; + INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness; + INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy; + + return LABEL_SUBSURFACE_SCATTER; +} + +ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd) +{ + /* Get bump mapped normal from shader evaluation at exit point. */ + float3 N = sd->N; + if (sd->flag & SD_HAS_BSSRDF_BUMP) { + N = shader_bssrdf_normal(sd); + } + + /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. 
*/ + sd->flag &= ~SD_CLOSURE_FLAGS; + sd->num_closure = 0; + sd->num_closure_left = kernel_data.max_closures; + + const float3 weight = one_float3(); + const float roughness = INTEGRATOR_STATE(subsurface, roughness); + +# ifdef __PRINCIPLED__ + if (roughness != FLT_MAX) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc( + sd, sizeof(PrincipledDiffuseBsdf), weight); + + if (bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else +# endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if (bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + } +} + +/* Random walk subsurface scattering. + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + +/* Support for anisotropy from: + * "Path Traced Subsurface Scattering using Anisotropic Phase Functions + * and Non-Exponential Free Flights". + * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery. + * https://graphics.pixar.com/library/PathTracedSubsurface/ */ + +ccl_device void subsurface_random_walk_remap( + const float albedo, const float d, float g, float *sigma_t, float *alpha) +{ + /* Compute attenuation and scattering coefficients from albedo. 
*/ + const float g2 = g * g; + const float g3 = g2 * g; + const float g4 = g3 * g; + const float g5 = g4 * g; + const float g6 = g5 * g; + const float g7 = g6 * g; + + const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 + + 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 + + -23.6264803333f * g6 + 7.21067002658f * g7; + const float B = 4.98511194385f + + 0.127355959438f * + expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 + + -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 + + 559.009054474f * g7); + const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 + + -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 + + 38.5402837518f * g6 + -12.7181042538f * g7; + const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 + + 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 + + -58.5106008578f * g6 + 15.8478295021f * g7; + const float E = 4.23190299701f + + 0.00310603949088f * + expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 + + -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 + + 1303.5318055f * g7); + const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 + + -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 + + 302.85712436f * g6 + -87.4370473567f * g7; + + const float blend = powf(albedo, 0.25f); + + *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) + + blend * D * powf(atanf(E * albedo), F); + *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision + + float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f); + *sigma_t = sigma_t_prime / (1.0f - g); +} + +ccl_device void subsurface_random_walk_coefficients(const float3 albedo, + const float3 radius, + const float anisotropy, + float3 *sigma_t, + float3 *alpha, + float3 *throughput) +{ + float sigma_t_x, sigma_t_y, sigma_t_z; + float alpha_x, alpha_y, alpha_z; + + 
subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x); + subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y); + subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z); + + /* Throughput already contains closure weight at this point, which includes the + * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo + * which will be added through scattering. */ + *throughput = safe_divide_color(*throughput, albedo); + + /* With low albedo values (like 0.025) we get diffusion_length 1.0 and + * infinite phase functions. To avoid a sharp discontinuity as we go from + * such values to 0.0, increase alpha and reduce the throughput to compensate. */ + const float min_alpha = 0.2f; + if (alpha_x < min_alpha) { + (*throughput).x *= alpha_x / min_alpha; + alpha_x = min_alpha; + } + if (alpha_y < min_alpha) { + (*throughput).y *= alpha_y / min_alpha; + alpha_y = min_alpha; + } + if (alpha_z < min_alpha) { + (*throughput).z *= alpha_z / min_alpha; + alpha_z = min_alpha; + } + + *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z); + *alpha = make_float3(alpha_x, alpha_y, alpha_z); +} + +/* References for Dwivedi sampling: + * + * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering" + * by Jaroslav KÅ™ivánek and Eugene d'Eon (SIGGRAPH 2014) + * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/ + * + * [2] "Improving the Dwivedi Sampling Scheme" + * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016) + * https://cg.ivd.kit.edu/1951.php + * + * [3] "Zero-Variance Theory for Efficient Subsurface Scattering" + * by Eugene d'Eon and Jaroslav KÅ™ivánek (SIGGRAPH 2020) + * https://iliyan.com/publications/RenderingCourse2020 + */ + +ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta) +{ + /* Eq. 
9 from [2] using precomputed log((v + 1) / (v - 1)) */ + return 1.0f / ((v - cos_theta) * phase_log); +} + +ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand) +{ + /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)` + * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation, + * we can implement the power function like this. */ + return v - (v + 1.0f) * expf(-rand * phase_log); +} + +ccl_device_forceinline float diffusion_length_dwivedi(float alpha) +{ + /* Eq. 67 from [3] */ + return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha)); +} + +ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv) +{ + float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta); + float phi = M_2PI_F * randv; + float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta); + + float3 T, B; + make_orthonormals(D, &T, &B); + return dir.x * T + dir.y * B + dir.z * D; +} + +ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t, + float t, + bool hit, + float3 *transmittance) +{ + float3 T = volume_color_transmittance(sigma_t, t); + if (transmittance) { + *transmittance = T; + } + return hit ? 
T : sigma_t * T; +} + +/* Define the below variable to get the similarity code active, + * and the value represents the cutoff level */ +# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9 + +ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS, + RNGState rng_state, + Ray &ray, + LocalIntersection &ss_isect) +{ + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + + const float3 P = INTEGRATOR_STATE(ray, P); + const float3 N = INTEGRATOR_STATE(ray, D); + const float ray_dP = INTEGRATOR_STATE(ray, dP); + const float time = INTEGRATOR_STATE(ray, time); + const float3 Ng = INTEGRATOR_STATE(isect, Ng); + const int object = INTEGRATOR_STATE(isect, object); + + /* Sample diffuse surface scatter into the object. */ + float3 D; + float pdf; + sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf); + if (dot(-Ng, D) <= 0.0f) { + return false; + } + + /* Setup ray. */ + ray.P = ray_offset(P, -Ng); + ray.D = D; + ray.t = FLT_MAX; + ray.time = time; + ray.dP = ray_dP; + ray.dD = differential_zero_compact(); + +# ifndef __KERNEL_OPTIX__ + /* Compute or fetch object transforms. */ + Transform ob_itfm ccl_optional_struct_init; + Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm); +# endif + + /* Convert subsurface to volume coefficients. + * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. 
*/ + const float3 albedo = INTEGRATOR_STATE(subsurface, albedo); + const float3 radius = INTEGRATOR_STATE(subsurface, radius); + const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy); + + float3 sigma_t, alpha; + float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput); + subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput); + float3 sigma_s = sigma_t * alpha; + + /* Theoretically it should be better to use the exact alpha for the channel we're sampling at + * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange + * for making the code significantly more complex and slower (if direction sampling depends on + * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on). + * + * Since the strength of the guided sampling increases as alpha gets lower, using a value that + * is too low results in fireflies while one that's too high just gives a bit more noise. + * Therefore, the code here uses the highest of the three albedos to be safe. */ + const float diffusion_length = diffusion_length_dwivedi(max3(alpha)); + + if (diffusion_length == 1.0f) { + /* With specific values of alpha the length might become 1, which in asymptotic makes phase to + * be infinite. After first bounce it will cause throughput to be 0. Do early output, avoiding + * numerical issues and extra unneeded work. */ + return false; + } + + /* Precompute term for phase sampling. */ + const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f)); + + /* Modify state for RNGs, decorrelated from other paths. */ + rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef); + + /* Random walk until we hit the surface again. */ + bool hit = false; + bool have_opposite_interface = false; + float opposite_distance = 0.0f; + + /* Todo: Disable for alpha>0.999 or so? */ + /* Our heuristic, a compromise between guiding and classic. 
*/ + const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f)); + +# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL + float3 sigma_s_star = sigma_s * (1.0f - anisotropy); + float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star; + float3 sigma_t_org = sigma_t; + float3 sigma_s_org = sigma_s; + const float anisotropy_org = anisotropy; + const float guided_fraction_org = guided_fraction; +# endif + + for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) { + /* Advance random number offset. */ + rng_state.rng_offset += PRNG_BOUNCE_NUM; + +# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL + // shadow with local variables according to depth + float anisotropy, guided_fraction; + float3 sigma_s, sigma_t; + if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) { + anisotropy = anisotropy_org; + guided_fraction = guided_fraction_org; + sigma_t = sigma_t_org; + sigma_s = sigma_s_org; + } + else { + anisotropy = 0.0f; + guided_fraction = 0.75f; // back to isotropic heuristic from Blender + sigma_t = sigma_t_star; + sigma_s = sigma_s_star; + } +# endif + + /* Sample color channel, use MIS with balance heuristic. */ + float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL); + float3 channel_pdf; + int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf); + float sample_sigma_t = volume_channel_get(sigma_t, channel); + float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE); + + /* We need the result of the raycast to compute the full guided PDF, so just remember the + * relevant terms to avoid recomputing them later. */ + float backward_fraction = 0.0f; + float forward_pdf_factor = 0.0f; + float forward_stretching = 1.0f; + float backward_pdf_factor = 0.0f; + float backward_stretching = 1.0f; + + /* For the initial ray, we already know the direction, so just do classic distance sampling. */ + if (bounce > 0) { + /* Decide whether we should use guided or classic sampling. 
*/ + bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction); + + /* Determine if we want to sample away from the incoming interface. + * This only happens if we found a nearby opposite interface, and the probability for it + * depends on how close we are to it already. + * This probability term comes from the recorded presentation of [3]. */ + bool guide_backward = false; + if (have_opposite_interface) { + /* Compute distance of the random walk between the tangent plane at the starting point + * and the assumed opposite interface (the parallel plane that contains the point we + * found in our ray query for the opposite side). */ + float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance); + backward_fraction = 1.0f / + (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length)); + guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction; + } + + /* Sample scattering direction. */ + float scatter_u, scatter_v; + path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v); + float cos_theta; + float hg_pdf; + if (guided) { + cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u); + /* The backwards guiding distribution is just mirrored along sd->N, so swapping the + * sign here is enough to sample from that instead. */ + if (guide_backward) { + cos_theta = -cos_theta; + } + float3 newD = direction_from_cosine(N, cos_theta, scatter_v); + hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy); + ray.D = newD; + } + else { + float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf); + cos_theta = dot(newD, N); + ray.D = newD; + } + + /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic). 
+ * Since phase sampling is channel-independent, we can get away with applying a factor + * to the guided PDF, which implicitly means pulling out the classic PDF term and letting + * it cancel with an equivalent term in the numerator of the full estimator. + * For the backward PDF, we again reuse the same probability distribution with a sign swap. + */ + forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) / + hg_pdf; + backward_pdf_factor = M_1_2PI_F * + eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf; + + /* Prepare distance sampling. + * For the backwards case, this also needs the sign swapped since now directions against + * sd->N (and therefore with negative cos_theta) are preferred. */ + forward_stretching = (1.0f - cos_theta / diffusion_length); + backward_stretching = (1.0f + cos_theta / diffusion_length); + if (guided) { + sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching; + } + } + + /* Sample direction along ray. */ + float t = -logf(1.0f - randt) / sample_sigma_t; + + /* On the first bounce, we use the raycast to check if the opposite side is nearby. + * If yes, we will later use backwards guided sampling in order to have a decent + * chance of connecting to it. + * Todo: Maybe use less than 10 times the mean free path? */ + ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t; + scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1); + hit = (ss_isect.num_hits > 0); + + if (hit) { +# ifdef __KERNEL_OPTIX__ + /* t is always in world space with OptiX. */ + ray.t = ss_isect.hits[0].t; +# else + /* Compute world space distance to surface hit. */ + float3 D = transform_direction(&ob_itfm, ray.D); + D = normalize(D) * ss_isect.hits[0].t; + ray.t = len(transform_direction(&ob_tfm, D)); +# endif + } + + if (bounce == 0) { + /* Check if we hit the opposite side. 
*/ + if (hit) { + have_opposite_interface = true; + opposite_distance = dot(ray.P + ray.t * ray.D - P, -N); + } + /* Apart from the opposite side check, we were supposed to only trace up to distance t, + * so check if there would have been a hit in that case. */ + hit = ray.t < t; + } + + /* Use the distance to the exit point for the throughput update if we found one. */ + if (hit) { + t = ray.t; + } + else if (bounce == 0) { + /* Restore original position if nothing was hit after the first bounce, + * without the ray_offset() that was added to avoid self-intersection. + * Otherwise if that offset is relatively large compared to the scattering + * radius, we never go back up high enough to exit the surface. */ + ray.P = P; + } + + /* Advance to new scatter location. */ + ray.P += t * ray.D; + + float3 transmittance; + float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance); + if (bounce > 0) { + /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */ + float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL); + + if (have_opposite_interface) { + /* First step of MIS: Depending on geometry we might have two methods for guided + * sampling, so perform MIS between them. */ + float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL); + guided_pdf = mix( + guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction); + } + else { + /* Just include phase sampling factor otherwise. */ + guided_pdf *= forward_pdf_factor; + } + + /* Now we apply the MIS balance heuristic between the classic and guided sampling. */ + pdf = mix(pdf, guided_pdf, guided_fraction); + } + + /* Finally, we're applying MIS again to combine the three color channels. + * Altogether, the MIS computation combines up to nine different estimators: + * {classic, guided, backward_guided} x {r, g, b} */ + throughput *= (hit ? 
transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf); + + if (hit) { + /* If we hit the surface, we are done. */ + break; + } + else if (throughput.x < VOLUME_THROUGHPUT_EPSILON && + throughput.y < VOLUME_THROUGHPUT_EPSILON && + throughput.z < VOLUME_THROUGHPUT_EPSILON) { + /* Avoid unnecessary work and precision issue when throughput gets really small. */ + break; + } + } + + if (hit) { + kernel_assert(isfinite3_safe(throughput)); + INTEGRATOR_STATE_WRITE(path, throughput) = throughput; + } + + return hit; +} + +ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS) +{ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + Ray ray ccl_optional_struct_init; + LocalIntersection ss_isect ccl_optional_struct_init; + + if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) { + return false; + } + +# ifdef __VOLUME__ + /* Update volume stack if needed. */ + if (kernel_data.integrator.use_volumes) { + const int object = intersection_get_object(kg, &ss_isect.hits[0]); + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) { + float3 P = INTEGRATOR_STATE(ray, P); + const float3 Ng = INTEGRATOR_STATE(isect, Ng); + const float3 offset_P = ray_offset(P, -Ng); + + integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P); + } + } +# endif /* __VOLUME__ */ + + /* Pretend ray is coming from the outside towards the exit point. This ensures + * correct front/back facing normals. + * TODO: find a more elegant solution? */ + ray.P += ray.D * ray.t * 2.0f; + ray.D = -ray.D; + + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]); + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Advanced random number offset for bounce. 
*/ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; + + const int shader = intersection_get_shader(kg, &ss_isect.hits[0]); + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + shader); + } + + return true; +} + +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h new file mode 100644 index 00000000000..d53070095f0 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h @@ -0,0 +1,223 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Volume Stack + * + * This is an array of object/shared ID's that the current segment of the path + * is inside of. 
*/ + +template<typename StackReadOp, typename StackWriteOp> +ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, + const ShaderData *sd, + StackReadOp stack_read, + StackWriteOp stack_write) +{ + /* todo: we should have some way for objects to indicate if they want the + * world shader to work inside them. excluding it by default is problematic + * because non-volume objects can't be assumed to be closed manifolds */ + if (!(sd->flag & SD_HAS_VOLUME)) { + return; + } + + if (sd->flag & SD_BACKFACING) { + /* Exit volume object: remove from stack. */ + for (int i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + if (entry.object == sd->object) { + /* Shift back next stack entries. */ + do { + entry = stack_read(i + 1); + stack_write(i, entry); + i++; + } while (entry.shader != SHADER_NONE); + + return; + } + } + } + else { + /* Enter volume object: add to stack. */ + int i; + for (i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + /* Already in the stack? then we have nothing to do. */ + if (entry.object == sd->object) { + return; + } + } + + /* If we exceed the stack limit, ignore. */ + if (i >= VOLUME_STACK_SIZE - 1) { + return; + } + + /* Add to the end of the stack. 
*/ + const VolumeStack new_entry = {sd->object, sd->shader}; + const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE}; + stack_write(i, new_entry); + stack_write(i + 1, empty_entry); + } +} + +ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd) +{ + volume_stack_enter_exit( + INTEGRATOR_STATE_PASS, + sd, + [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); }, + [=](const int i, const VolumeStack entry) { + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry); + }); +} + +ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd) +{ + volume_stack_enter_exit( + INTEGRATOR_STATE_PASS, + sd, + [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }, + [=](const int i, const VolumeStack entry) { + integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry); + }); +} + +/* Clean stack after the last bounce. + * + * It is expected that all volumes are closed manifolds, so at the time when ray + * hits nothing (for example, it is a last bounce which goes to environment) the + * only expected volume in the stack is the world's one. All the rest volume + * entries should have been exited already. + * + * This isn't always true because of ray intersection precision issues, which + * could lead us to an infinite non-world volume in the stack, causing render + * artifacts. + * + * Use this function after the last bounce to get rid of all volumes apart from + * the world's one after the last bounce to avoid render artifacts. + */ +ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS) +{ + if (kernel_data.background.volume_shader != SHADER_NONE) { + /* Keep the world's volume in stack. 
*/ + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE; + } + else { + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE; + } +} + +template<typename StackReadOp> +ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read) +{ + float step_size = FLT_MAX; + + for (int i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags; + + bool heterogeneous = false; + + if (shader_flag & SD_HETEROGENEOUS_VOLUME) { + heterogeneous = true; + } + else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) { + /* We want to render world or objects without any volume grids + * as homogeneous, but can only verify this at run-time since other + * heterogeneous volume objects may be using the same shader. */ + int object = entry.object; + if (object != OBJECT_NONE) { + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) { + heterogeneous = true; + } + } + } + + if (heterogeneous) { + float object_step_size = object_volume_step_size(kg, entry.object); + object_step_size *= kernel_data.integrator.volume_step_rate; + step_size = fminf(object_step_size, step_size); + } + } + + return step_size; +} + +typedef enum VolumeSampleMethod { + VOLUME_SAMPLE_NONE = 0, + VOLUME_SAMPLE_DISTANCE = (1 << 0), + VOLUME_SAMPLE_EQUIANGULAR = (1 << 1), + VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR), +} VolumeSampleMethod; + +ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS) +{ + VolumeSampleMethod method = VOLUME_SAMPLE_NONE; + + for (int i = 0;; i++) { + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.shader == SHADER_NONE) { + break; + } + + int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags; + + if (shader_flag & SD_VOLUME_MIS) { 
+ /* Multiple importance sampling. */ + return VOLUME_SAMPLE_MIS; + } + else if (shader_flag & SD_VOLUME_EQUIANGULAR) { + /* Distance + equiangular sampling -> multiple importance sampling. */ + if (method == VOLUME_SAMPLE_DISTANCE) { + return VOLUME_SAMPLE_MIS; + } + + /* Only equiangular sampling. */ + method = VOLUME_SAMPLE_EQUIANGULAR; + } + else { + /* Distance + equiangular sampling -> multiple importance sampling. */ + if (method == VOLUME_SAMPLE_EQUIANGULAR) { + return VOLUME_SAMPLE_MIS; + } + + /* Distance sampling only. */ + method = VOLUME_SAMPLE_DISTANCE; + } + } + + return method; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 61653d328f1..9e12d24dcf4 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -14,751 +14,501 @@ * limitations under the License. */ +#pragma once + +#include "kernel_adaptive_sampling.h" +#include "kernel_random.h" +#include "kernel_shadow_catcher.h" +#include "kernel_write_passes.h" + CCL_NAMESPACE_BEGIN -/* BSDF Eval +/* -------------------------------------------------------------------- + * BSDF Evaluation * - * BSDF evaluation result, split per BSDF type. This is used to accumulate - * render passes separately. */ - -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd); + * BSDF evaluation result, split between diffuse and glossy. This is used to + * accumulate render passes separately. Note that reflection, transmission + * and volume scattering are written to different render passes, but we assume + * that only one of those can happen at a bounce, and so do not need to accumulate + * them separately. 
*/ -ccl_device_inline void bsdf_eval_init(BsdfEval *eval, - ClosureType type, - float3 value, - int use_light_pass) +ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value) { -#ifdef __PASSES__ - eval->use_light_pass = use_light_pass; - - if (eval->use_light_pass) { - eval->diffuse = zero_float3(); - eval->glossy = zero_float3(); - eval->transmission = zero_float3(); - eval->transparent = zero_float3(); - eval->volume = zero_float3(); - - if (type == CLOSURE_BSDF_TRANSPARENT_ID) - eval->transparent = value; - else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) - eval->diffuse = value; - else if (CLOSURE_IS_BSDF_GLOSSY(type)) - eval->glossy = value; - else if (CLOSURE_IS_BSDF_TRANSMISSION(type)) - eval->transmission = value; - else if (CLOSURE_IS_PHASE(type)) - eval->volume = value; - } - else -#endif - { + eval->diffuse = zero_float3(); + eval->glossy = zero_float3(); + + if (is_diffuse) { eval->diffuse = value; } -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis = zero_float3(); -#endif + else { + eval->glossy = value; + } } ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, - ClosureType type, + const bool is_diffuse, float3 value, float mis_weight) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis += value; -#endif value *= mis_weight; -#ifdef __PASSES__ - if (eval->use_light_pass) { - if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) - eval->diffuse += value; - else if (CLOSURE_IS_BSDF_GLOSSY(type)) - eval->glossy += value; - else if (CLOSURE_IS_BSDF_TRANSMISSION(type)) - eval->transmission += value; - else if (CLOSURE_IS_PHASE(type)) - eval->volume += value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else -#endif - { - eval->diffuse += value; - } -} -ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) -{ -#ifdef __PASSES__ - if (eval->use_light_pass) { - return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) 
&& - is_zero(eval->transparent) && is_zero(eval->volume); + if (is_diffuse) { + eval->diffuse += value; } - else -#endif - { - return is_zero(eval->diffuse); + else { + eval->glossy += value; } } -ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value) +ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) { -#ifdef __PASSES__ - if (eval->use_light_pass) { - eval->diffuse *= value; - eval->glossy *= value; - eval->transmission *= value; - eval->volume *= value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else -#endif - { - eval->diffuse *= value; - } + return is_zero(eval->diffuse) && is_zero(eval->glossy); } ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis *= value; -#endif - bsdf_eval_mis(eval, value); + eval->diffuse *= value; + eval->glossy *= value; } ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis *= value; -#endif -#ifdef __PASSES__ - if (eval->use_light_pass) { - eval->diffuse *= value; - eval->glossy *= value; - eval->transmission *= value; - eval->volume *= value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else - eval->diffuse *= value; -#else eval->diffuse *= value; -#endif + eval->glossy *= value; } ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval) { -#ifdef __PASSES__ - if (eval->use_light_pass) { - return eval->diffuse + eval->glossy + eval->transmission + eval->volume; - } - else -#endif - return eval->diffuse; + return eval->diffuse + eval->glossy; } -/* Path Radiance - * - * We accumulate different render passes separately. After summing at the end - * to get the combined result, it should be identical. We definite directly - * visible as the first non-transparent hit, while indirectly visible are the - * bounces after that. 
*/ - -ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L) +ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval) { - /* clear all */ -#ifdef __PASSES__ - L->use_light_pass = kernel_data.film.use_light_pass; - - if (kernel_data.film.use_light_pass) { - L->indirect = zero_float3(); - L->direct_emission = zero_float3(); - - L->color_diffuse = zero_float3(); - L->color_glossy = zero_float3(); - L->color_transmission = zero_float3(); - - L->direct_diffuse = zero_float3(); - L->direct_glossy = zero_float3(); - L->direct_transmission = zero_float3(); - L->direct_volume = zero_float3(); - - L->indirect_diffuse = zero_float3(); - L->indirect_glossy = zero_float3(); - L->indirect_transmission = zero_float3(); - L->indirect_volume = zero_float3(); - - L->transparent = 0.0f; - L->emission = zero_float3(); - L->background = zero_float3(); - L->ao = zero_float3(); - L->shadow = zero_float3(); - L->mist = 0.0f; - - L->state.diffuse = zero_float3(); - L->state.glossy = zero_float3(); - L->state.transmission = zero_float3(); - L->state.volume = zero_float3(); - L->state.direct = zero_float3(); - } - else -#endif - { - L->transparent = 0.0f; - L->emission = zero_float3(); - } - -#ifdef __SHADOW_TRICKS__ - L->path_total = zero_float3(); - L->path_total_shaded = zero_float3(); - L->shadow_background_color = zero_float3(); - L->shadow_throughput = 0.0f; - L->shadow_transparency = 1.0f; - L->has_shadow_catcher = 0; -#endif - -#ifdef __DENOISING_FEATURES__ - L->denoising_normal = zero_float3(); - L->denoising_albedo = zero_float3(); - L->denoising_depth = 0.0f; -#endif + /* Ratio of diffuse and glossy to recover proportions for writing to render pass. + * We assume reflection, transmission and volume scatter to be exclusive. 
*/ + return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy); } -ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg, - PathRadianceState *L_state, - ccl_addr_space float3 *throughput, - BsdfEval *bsdf_eval, - float bsdf_pdf, - int bounce, - int bsdf_label) -{ - float inverse_pdf = 1.0f / bsdf_pdf; - -#ifdef __PASSES__ - if (kernel_data.film.use_light_pass) { - if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) { - /* first on directly visible surface */ - float3 value = *throughput * inverse_pdf; - - L_state->diffuse = bsdf_eval->diffuse * value; - L_state->glossy = bsdf_eval->glossy * value; - L_state->transmission = bsdf_eval->transmission * value; - L_state->volume = bsdf_eval->volume * value; - - *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume; +/* -------------------------------------------------------------------- + * Clamping + * + * Clamping is done on a per-contribution basis so that we can write directly + * to render buffers instead of using per-thread memory, and to avoid the + * impact of clamping on other contributions. */ - L_state->direct = *throughput; - } - else { - /* transparent bounce before first hit, or indirectly visible through BSDF */ - float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf; - *throughput *= sum; - } +ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce) +{ +#ifdef __KERNEL_DEBUG_NAN__ + if (!isfinite3_safe(*L)) { + kernel_assert(!"Cycles sample with non-finite value detected"); } - else #endif - { - *throughput *= bsdf_eval->diffuse * inverse_pdf; - } -} + /* Make sure all components are finite, allowing the contribution to be usable by adaptive + * sampling convergence check, but also to make it so render result never causes issues with + * post-processing. 
*/ + *L = ensure_finite3(*L); #ifdef __CLAMP_SAMPLE__ -ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce) -{ float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect : kernel_data.integrator.sample_clamp_direct; float sum = reduce_add(fabs(*L)); if (sum > limit) { *L *= limit / sum; } +#endif } -ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg, - float3 *L, - float3 *throughput, - int bounce) -{ - float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect : - kernel_data.integrator.sample_clamp_direct; +/* -------------------------------------------------------------------- + * Pass accumulation utilities. + */ - float sum = reduce_add(fabs(*L)); - if (sum > limit) { - float clamp_factor = limit / sum; - *L *= clamp_factor; - *throughput *= clamp_factor; - } +/* Get pointer to pixel in render buffer. */ +ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer( + INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer) +{ + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + return render_buffer + render_buffer_offset; } -#endif +/* -------------------------------------------------------------------- + * Adaptive sampling. 
+ */ -ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *ccl_restrict render_buffer, + int sample) { -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; + if (kernel_data.film.pass_sample_count == PASS_UNUSED) { + return sample; } -#endif - float3 contribution = throughput * value; -#ifdef __CLAMP_SAMPLE__ - path_radiance_clamp(kg, &contribution, state->bounce - 1); -#endif + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->bounce == 0) - L->emission += contribution; - else if (state->bounce == 1) - L->direct_emission += contribution; - else - L->indirect += contribution; - } - else -#endif - { - L->emission += contribution; - } + return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1); } -ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 alpha, - float3 bsdf, - float3 ao) +ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) { -#ifdef __PASSES__ - /* Store AO pass. */ - if (L->use_light_pass && state->bounce == 0) { - L->ao += alpha * throughput * ao; - } -#endif - -#ifdef __SHADOW_TRICKS__ - /* For shadow catcher, accumulate ratio. */ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping + * criteria. 
This is the heuristic from "A hierarchical automatic stopping condition for Monte + * Carlo global illumination" except that here it is applied per pixel and not in hierarchical + * tiles. */ - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) { + return; } -#endif - - float3 contribution = throughput * bsdf * ao; -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->bounce == 0) { - /* Directly visible lighting. */ - L->direct_diffuse += contribution; - } - else { - /* Indirectly visible lighting after BSDF bounce. */ - L->indirect += contribution; - } - } - else -#endif - { - L->emission += contribution; + const int sample = INTEGRATOR_STATE(path, sample); + if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_adaptive_aux_buffer, + make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f)); } } -ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 bsdf) -{ -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * bsdf; - } -#else - (void)L; - (void)state; - (void)throughput; - (void)bsdf; -#endif -} +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +#ifdef __SHADOW_CATCHER__ -ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - BsdfEval *bsdf_eval, - float3 shadow, - float shadow_fac, - bool is_lamp) +/* Accumulate contribution to the Shadow Catcher pass. + * + * Returns truth if the contribution is fully handled here and is not to be added to the other + * passes (like combined, adaptive sampling). 
*/ + +ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) { -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; - - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (!kernel_data.integrator.has_shadow_catcher) { + return false; } -#endif - float3 shaded_throughput = throughput * shadow; + kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); -#ifdef __PASSES__ - if (L->use_light_pass) { - /* Compute the clamping based on the total contribution. - * The resulting scale is then be applied to all individual components. */ - float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval); -# ifdef __CLAMP_SAMPLE__ - path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce); -# endif - - if (state->bounce == 0) { - /* directly visible lighting */ - L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse; - L->direct_glossy += shaded_throughput * bsdf_eval->glossy; - L->direct_transmission += shaded_throughput * bsdf_eval->transmission; - L->direct_volume += shaded_throughput * bsdf_eval->volume; - - if (is_lamp) { - L->shadow += shadow * shadow_fac; - } - } - else { - /* indirectly visible lighting after BSDF bounce */ - L->indirect += full_contribution; - } + /* Matte pass. */ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution); + /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive + * sampling is based on how noisy the combined pass is as if there were no catchers in the + * scene. 
*/ } - else -#endif - { - float3 contribution = shaded_throughput * bsdf_eval->diffuse; - path_radiance_clamp(kg, &contribution, state->bounce); - L->emission += contribution; + + /* Shadow catcher pass. */ + if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution); + return true; } -} -ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - const BsdfEval *bsdf_eval) -{ -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * bsdf_eval->sum_no_mis; - } -#else - (void)L; - (void)state; - (void)throughput; - (void)bsdf_eval; -#endif + return false; } -ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + const float transparent, + ccl_global float *ccl_restrict buffer) { + if (!kernel_data.integrator.has_shadow_catcher) { + return false; + } -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * value; - L->path_total_shaded += throughput * value * L->shadow_transparency; + kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) { + return true; } -#endif - float3 contribution = throughput * value; -#ifdef __CLAMP_SAMPLE__ - path_radiance_clamp(kg, &contribution, state->bounce - 1); -#endif + /* Matte pass. 
*/ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_shadow_catcher_matte, + make_float4(contribution.x, contribution.y, contribution.z, transparent)); + /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive + * sampling is based on how noisy the combined pass is as if there were no catchers in the + * scene. */ + } -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) - L->background += contribution; - else if (state->bounce == 1) - L->direct_emission += contribution; - else - L->indirect += contribution; - } - else -#endif - { - L->emission += contribution; + /* Shadow catcher pass. */ + if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) { + /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the + * calculation and the alpha channel of the pass contains numbers of samples contributed to a + * pixel of the pass. 
*/ + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution); + return true; } -#ifdef __DENOISING_FEATURES__ - L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput * - value; -#endif /* __DENOISING_FEATURES__ */ + return false; } -ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput) +ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS, + const float transparent, + ccl_global float *ccl_restrict buffer) { - L->transparent += average(throughput); -} + if (!kernel_data.integrator.has_shadow_catcher) { + return; + } -#ifdef __SHADOW_TRICKS__ -ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L, - float3 throughput, - float3 background) -{ - L->shadow_throughput += average(throughput); - L->shadow_background_color += throughput * background; - L->has_shadow_catcher = 1; -} -#endif + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); -ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) -{ -#ifdef __PASSES__ - /* this division is a bit ugly, but means we only have to keep track of - * only a single throughput further along the path, here we recover just - * the indirect path that is not influenced by any particular BSDF type */ - if (L->use_light_pass) { - L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct); - L->direct_diffuse += L->state.diffuse * L->direct_emission; - L->direct_glossy += L->state.glossy * L->direct_emission; - L->direct_transmission += L->state.transmission * L->direct_emission; - L->direct_volume += L->state.volume * L->direct_emission; - - L->indirect = safe_divide_color(L->indirect, L->state.direct); - L->indirect_diffuse += L->state.diffuse * L->indirect; - L->indirect_glossy += L->state.glossy * L->indirect; - L->indirect_transmission += L->state.transmission * L->indirect; - 
L->indirect_volume += L->state.volume * L->indirect; + /* Matte pass. */ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent); } -#endif } -ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) -{ -#ifdef __PASSES__ - if (L->use_light_pass) { - L->state.diffuse = zero_float3(); - L->state.glossy = zero_float3(); - L->state.transmission = zero_float3(); - L->state.volume = zero_float3(); +#endif /* __SHADOW_CATCHER__ */ + +/* -------------------------------------------------------------------- + * Render passes. + */ - L->direct_emission = zero_float3(); - L->indirect = zero_float3(); +/* Write combined pass. */ +ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) +{ +#ifdef __SHADOW_CATCHER__ + if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) { + return; } #endif + + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution); + } + + kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer); } -ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src) +/* Write combined pass with transparency. 
*/ +ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + const float transparent, + ccl_global float *ccl_restrict + buffer) { -#ifdef __PASSES__ - if (L->use_light_pass) { - L->state = L_src->state; - - L->direct_emission = L_src->direct_emission; - L->indirect = L_src->indirect; +#ifdef __SHADOW_CATCHER__ + if (kernel_accum_shadow_catcher_transparent( + INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) { + return; } #endif + + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_combined, + make_float4(contribution.x, contribution.y, contribution.z, transparent)); + } + + kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer); } -#ifdef __SHADOW_TRICKS__ -ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg, - PathRadiance *L, - float3 *L_sum, - float *alpha) +/* Write background or emission to appropriate pass. */ +ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS, + float3 contribution, + ccl_global float *ccl_restrict + buffer, + const int pass) { - /* Calculate current shadow of the path. */ - float path_total = average(L->path_total); - float shadow; + if (!(kernel_data.film.light_pass_flag & PASS_ANY)) { + return; + } - if (UNLIKELY(!isfinite_safe(path_total))) { -# ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite total radiance along the path"); -# endif - shadow = 0.0f; +#ifdef __PASSES__ + const int path_flag = INTEGRATOR_STATE(path, flag); + int pass_offset = PASS_UNUSED; + + /* Denoising albedo. 
*/ +# ifdef __DENOISING_FEATURES__ + if (path_flag & PATH_RAY_DENOISING_FEATURES) { + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, + denoising_feature_throughput); + const float3 denoising_albedo = denoising_feature_throughput * contribution; + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); + } } - else if (path_total == 0.0f) { - shadow = L->shadow_transparency; +# endif /* __DENOISING_FEATURES__ */ + + if (!(path_flag & PATH_RAY_ANY_PASS)) { + /* Directly visible, write to emission or background pass. */ + pass_offset = pass; + } + else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) { + /* Indirectly visible through reflection. */ + const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ? + ((INTEGRATOR_STATE(path, bounce) == 1) ? + kernel_data.film.pass_glossy_direct : + kernel_data.film.pass_glossy_indirect) : + ((INTEGRATOR_STATE(path, bounce) == 1) ? + kernel_data.film.pass_transmission_direct : + kernel_data.film.pass_transmission_indirect); + + if (glossy_pass_offset != PASS_UNUSED) { + /* Glossy is a subset of the throughput, reconstruct it here using the + * diffuse-glossy ratio. */ + const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio); + const float3 glossy_contribution = (one_float3() - ratio) * contribution; + kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution); + } + + /* Reconstruct diffuse subset of throughput. */ + pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct : + kernel_data.film.pass_diffuse_indirect; + if (pass_offset != PASS_UNUSED) { + contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio); + } } - else { - float path_total_shaded = average(L->path_total_shaded); - shadow = path_total_shaded / path_total; + else if (path_flag & PATH_RAY_VOLUME_PASS) { + /* Indirectly visible through volume. 
*/ + pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct : + kernel_data.film.pass_volume_indirect; } - /* Calculate final light sum and transparency for shadow catcher object. */ - if (kernel_data.background.transparent) { - *alpha -= L->shadow_throughput * shadow; - } - else { - L->shadow_background_color *= shadow; - *L_sum += L->shadow_background_color; + /* Single write call for GPU coherence. */ + if (pass_offset != PASS_UNUSED) { + kernel_write_pass_float3(buffer + pass_offset, contribution); } +#endif /* __PASSES__ */ } -#endif -ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, - PathRadiance *L, - float *alpha) +/* Write light contribution to render buffer. */ +ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *ccl_restrict render_buffer) { - float3 L_sum; - /* Light Passes are used */ + /* The throughput for shadow paths already contains the light shader evaluation. */ + float3 contribution = INTEGRATOR_STATE(shadow_path, throughput); + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1); + + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); + + kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer); + #ifdef __PASSES__ - float3 L_direct, L_indirect; - if (L->use_light_pass) { - path_radiance_sum_indirect(L); - - L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume + - L->emission; - L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + - L->indirect_volume; - - if (!kernel_data.background.transparent) - L_direct += L->background; - - L_sum = L_direct + L_indirect; - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - - /* Reject invalid value */ - if (!isfinite_safe(sum)) { -# ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); -# 
endif - L_sum = zero_float3(); - - L->direct_diffuse = zero_float3(); - L->direct_glossy = zero_float3(); - L->direct_transmission = zero_float3(); - L->direct_volume = zero_float3(); - - L->indirect_diffuse = zero_float3(); - L->indirect_glossy = zero_float3(); - L->indirect_transmission = zero_float3(); - L->indirect_volume = zero_float3(); - - L->emission = zero_float3(); + if (kernel_data.film.light_pass_flag & PASS_ANY) { + const int path_flag = INTEGRATOR_STATE(shadow_path, flag); + int pass_offset = PASS_UNUSED; + + if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) { + /* Indirectly visible through reflection. */ + const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ? + ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_glossy_direct : + kernel_data.film.pass_glossy_indirect) : + ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_transmission_direct : + kernel_data.film.pass_transmission_indirect); + + if (glossy_pass_offset != PASS_UNUSED) { + /* Glossy is a subset of the throughput, reconstruct it here using the + * diffuse-glossy ratio. */ + const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio); + const float3 glossy_contribution = (one_float3() - ratio) * contribution; + kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution); + } + + /* Reconstruct diffuse subset of throughput. */ + pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_diffuse_direct : + kernel_data.film.pass_diffuse_indirect; + if (pass_offset != PASS_UNUSED) { + contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio); + } + } + else if (path_flag & PATH_RAY_VOLUME_PASS) { + /* Indirectly visible through volume. */ + pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ? 
+ kernel_data.film.pass_volume_direct : + kernel_data.film.pass_volume_indirect; } - } - /* No Light Passes */ - else -#endif - { - L_sum = L->emission; + /* Single write call for GPU coherence. */ + if (pass_offset != PASS_UNUSED) { + kernel_write_pass_float3(buffer + pass_offset, contribution); + } - /* Reject invalid value */ - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if (!isfinite_safe(sum)) { -#ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); -#endif - L_sum = zero_float3(); + /* Write shadow pass. */ + if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) && + (path_flag & PATH_RAY_CAMERA)) { + const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput); + const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput); + const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) * + kernel_data.film.pass_shadow_scale; + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow); } } +#endif +} - /* Compute alpha. */ - *alpha = 1.0f - L->transparent; +/* Write transparency to render buffer. + * + * Note that we accumulate transparency = 1 - alpha in the render buffer. + * Otherwise we'd have to write alpha on path termination, which happens + * in many places. */ +ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS, + const float transparent, + ccl_global float *ccl_restrict render_buffer) +{ + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); - /* Add shadow catcher contributions. 
*/ -#ifdef __SHADOW_TRICKS__ - if (L->has_shadow_catcher) { - path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha); + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent); } -#endif /* __SHADOW_TRICKS__ */ - return L_sum; + kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer); } -ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, - PathRadiance *L, - float3 *noisy, - float3 *clean) +/* Write background contribution to render buffer. + * + * Includes transparency, matching kernel_accum_transparent. */ +ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS, + const float3 L, + const float transparent, + const bool is_transparent_background_ray, + ccl_global float *ccl_restrict render_buffer) { -#ifdef __PASSES__ - kernel_assert(L->use_light_pass); - - *clean = L->emission + L->background; - *noisy = L->direct_volume + L->indirect_volume; - -# define ADD_COMPONENT(flag, component) \ - if (kernel_data.film.denoising_flags & flag) \ - *clean += component; \ - else \ - *noisy += component; - - ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); - ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); - ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); - ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); - ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); - ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); -# undef ADD_COMPONENT -#else - *noisy = L->emission; - *clean = zero_float3(); -#endif + float3 contribution = INTEGRATOR_STATE(path, throughput) * L; + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1); -#ifdef __SHADOW_TRICKS__ - if (L->has_shadow_catcher) { - *noisy += L->shadow_background_color; - } -#endif + ccl_global float *buffer = 
kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); - *noisy = ensure_finite3(*noisy); - *clean = ensure_finite3(*clean); + if (is_transparent_background_ray) { + kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer); + } + else { + kernel_accum_combined_transparent_pass( + INTEGRATOR_STATE_PASS, contribution, transparent, buffer); + } + kernel_accum_emission_or_background_pass( + INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background); } -ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample) +/* Write emission to render buffer. */ +ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS, + const float3 throughput, + const float3 L, + ccl_global float *ccl_restrict render_buffer) { -#ifdef __SPLIT_KERNEL__ -# define safe_float3_add(f, v) \ - do { \ - ccl_global float *p = (ccl_global float *)(&(f)); \ - atomic_add_and_fetch_float(p + 0, (v).x); \ - atomic_add_and_fetch_float(p + 1, (v).y); \ - atomic_add_and_fetch_float(p + 2, (v).z); \ - } while (0) -# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v)) -#else -# define safe_float3_add(f, v) (f) += (v) -# define safe_float_add(f, v) (f) += (v) -#endif /* __SPLIT_KERNEL__ */ + float3 contribution = throughput * L; + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1); -#ifdef __PASSES__ - safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse); - safe_float3_add(L->direct_glossy, L_sample->direct_glossy); - safe_float3_add(L->direct_transmission, L_sample->direct_transmission); - safe_float3_add(L->direct_volume, L_sample->direct_volume); - - safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse); - safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy); - safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission); - safe_float3_add(L->indirect_volume, L_sample->indirect_volume); - - safe_float3_add(L->background, 
L_sample->background); - safe_float3_add(L->ao, L_sample->ao); - safe_float3_add(L->shadow, L_sample->shadow); - safe_float_add(L->mist, L_sample->mist); -#endif /* __PASSES__ */ - safe_float3_add(L->emission, L_sample->emission); + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); -#undef safe_float_add -#undef safe_float3_add + kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer); + kernel_accum_emission_or_background_pass( + INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h index 98b7bf7e7dc..2bee12f0473 100644 --- a/intern/cycles/kernel/kernel_adaptive_sampling.h +++ b/intern/cycles/kernel/kernel_adaptive_sampling.h @@ -14,226 +14,146 @@ * limitations under the License. */ -#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ -#define __KERNEL_ADAPTIVE_SAMPLING_H__ +#pragma once + +#include "kernel/kernel_write_passes.h" CCL_NAMESPACE_BEGIN -/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ +/* Check whether the pixel has converged and should not be sampled anymore. */ -ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, - ccl_global float *buffer, - int sample) +ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *render_buffer) { - /* TODO Stefan: Is this better in linear, sRGB or something else? */ - float4 I = *((ccl_global float4 *)buffer); - float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - /* The per pixel error as seen in section 2.1 of - * "A hierarchical automatic stopping condition for Monte Carlo global illumination" - * A small epsilon is added to the divisor to prevent division by zero. 
*/ - float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / - (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); - if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { - /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ - buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) { + return true; } -} - -/* Adjust the values of an adaptively sampled pixel. */ - -ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, - ccl_global float *buffer, - float sample_multiplier) -{ - *(ccl_global float4 *)(buffer) *= sample_multiplier; - /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */ - kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); - *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset; -#ifdef __PASSES__ - int flag = kernel_data.film.pass_flag; - - if (flag & PASSMASK(NORMAL)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + return buffer[aux_w_offset] == 0.0f; +} - if (flag & PASSMASK(UV)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. 
 */
 
-  if (flag & PASSMASK(MOTION)) {
-    *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
-    *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+                                                           ccl_global float *render_buffer,
+                                                           int x,
+                                                           int y,
+                                                           float threshold,
+                                                           bool reset,
+                                                           int offset,
+                                                           int stride)
+{
+  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+  const int render_pixel_index = offset + x + y * stride;
+  ccl_global float *buffer = render_buffer +
+                             (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+  /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+  const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+  if (!reset && A.w != 0.0f) {
+    /* If the pixel was considered converged, its state will not change in this kernel. Early
+     * output before doing any math.
+     *
+     * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+    return true;
+  }
 
-  if (kernel_data.film.use_light_pass) {
-    int light_flag = kernel_data.film.light_pass_flag;
-
-    if (light_flag & PASSMASK(MIST))
-      *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
-    /* Shadow pass omitted on purpose. It has its own scale parameter. 
*/ - - if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(DIFFUSE_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_direct) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; - - if (light_flag & PASSMASK(EMISSION)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; - if (light_flag & PASSMASK(BACKGROUND)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; - if (light_flag & PASSMASK(AO)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; - - if (light_flag & PASSMASK(DIFFUSE_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_COLOR)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_color) *= sample_multiplier; - } -#endif - -#ifdef __DENOISING_FEATURES__ - -# 
define scale_float3_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale; \ - *(buffer + offset + 3) *= scale * scale; \ - *(buffer + offset + 4) *= scale * scale; \ - *(buffer + offset + 5) *= scale * scale; - -# define scale_shadow_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale * scale; - - if (kernel_data.film.pass_denoising_data) { - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); - if (kernel_data.film.pass_denoising_clean) { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; - } - else { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - } - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + - 1) *= sample_multiplier * sample_multiplier; - } -#endif /* __DENOISING_FEATURES__ */ - - /* Cryptomatte. */ - if (kernel_data.film.cryptomatte_passes) { - int num_slots = 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 
1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; - num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; - ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + - kernel_data.film.pass_cryptomatte); - for (int slot = 0; slot < num_slots; slot++) { - id_buffer[slot].y *= sample_multiplier; - } - } + const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined); - /* AOVs. */ - for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) { - *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier; - } - for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) { - *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier; - } + const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]); + const float inv_sample = 1.0f / sample; + + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */ + const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) * + inv_sample; + const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample); + /* A small epsilon is added to the divisor to prevent division by zero. */ + const float error = error_difference / (0.0001f + error_normalize); + const bool did_converge = (error < threshold); + + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + buffer[aux_w_offset] = did_converge; + + return did_converge; } /* This is a simple box filter in two passes. * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. 
*/ -ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) { - bool any = false; + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - for (int x = tile->x; x < tile->x + tile->w; ++x) { - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (x > tile->x && !prev) { + for (int x = start_x; x < start_x + width; ++x) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (x > start_x && !prev) { index = index - 1; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } -ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) { + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - bool any = false; - for (int y = tile->y; y < tile->y + tile->h; ++y) { - int index = tile->offset + x + y * tile->stride; - ccl_global float 
*buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (y > tile->y && !prev) { - index = index - tile->stride; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + for (int y = start_y; y < start_y + height; ++y) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (y > start_y && !prev) { + index = index - stride; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } CCL_NAMESPACE_END - -#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 7da890b908d..e025bcd6674 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -14,502 +14,62 @@ * limitations under the License. */ -CCL_NAMESPACE_BEGIN - -#ifdef __BAKING__ - -ccl_device_noinline void compute_light_pass( - KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample) -{ - kernel_assert(kernel_data.film.use_light_pass); - - float3 throughput = one_float3(); - - /* Emission and indirect shader data memory used by various functions. */ - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - ShaderData indirect_sd; - - /* Init radiance. */ - path_radiance_init(kg, L); - - /* Init path state. 
*/ - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL); - - /* Evaluate surface shader. */ - shader_eval_surface(kg, sd, &state, NULL, state.flag); - - /* TODO: disable more closures we don't need besides transparent. */ - shader_bsdf_disable_transparency(kg, sd); - - /* Init ray. */ - Ray ray; - ray.P = sd->P + sd->Ng; - ray.D = -sd->Ng; - ray.t = FLT_MAX; -# ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -# endif - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched) { - /* regular path tracer */ -# endif - - /* sample ambient occlusion */ - if (pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd)); - } - - /* sample emission */ - if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { - float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(kg, L, &state, throughput, emission); - } - - bool is_sss_sample = false; - -# ifdef __SUBSURFACE__ - /* sample subsurface scattering */ - if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) { - /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting - * if scattering was successful. 
*/ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) { - while (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput); - kernel_path_indirect( - kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object); - } - is_sss_sample = true; - } - } -# endif - - /* sample light and BSDF */ - if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L); - - if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) { -# ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -# endif - /* compute indirect light */ - kernel_path_indirect( - kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object); - - /* sum and reset indirect light pass variables for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } -# ifdef __BRANCHED_PATH__ - } - else { - /* branched path tracer */ - - /* sample ambient occlusion */ - if (pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput); - } - - /* sample emission */ - if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { - float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(kg, L, &state, throughput, emission); - } - -# ifdef __SUBSURFACE__ - /* sample subsurface scattering */ - if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) { - /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting - * if scattering was successful. 
*/ - kernel_branched_path_subsurface_scatter( - kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput); - } -# endif - - /* sample light and BSDF */ - if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) { -# if defined(__EMISSION__) - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, &state, throughput, 1.0f, L, all); - } -# endif - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L); - } - } -# endif -} - -/* this helps with AA but it's not the real solution as it does not AA the geometry - * but it's better than nothing, thus committed */ -ccl_device_inline float bake_clamp_mirror_repeat(float u, float max) -{ - /* use mirror repeat (like opengl texture) so that if the barycentric - * coordinate goes past the end of the triangle it is not always clamped - * to the same value, gives ugly patterns */ - u /= max; - float fu = floorf(u); - u = u - fu; - - return ((((int)fu) & 1) ? 
1.0f - u : u) * max; -} - -ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, - ShaderData *sd, - const ShaderEvalType type) -{ - switch (type) { - case SHADER_EVAL_DIFFUSE: - return shader_bsdf_diffuse(kg, sd); - case SHADER_EVAL_GLOSSY: - return shader_bsdf_glossy(kg, sd); - case SHADER_EVAL_TRANSMISSION: - return shader_bsdf_transmission(kg, sd); - default: - kernel_assert(!"Unknown bake type passed to BSDF evaluate"); - return zero_float3(); - } -} - -ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - float3 direct, - float3 indirect, - const ShaderEvalType type, - const int pass_filter) -{ - float3 color; - const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0; - const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0; - const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0; - float3 out = zero_float3(); - - if (is_color) { - if (is_direct || is_indirect) { - /* Leave direct and diffuse channel colored. */ - color = one_float3(); - } - else { - /* surface color of the pass only */ - shader_eval_surface(kg, sd, state, NULL, 0); - return kernel_bake_shader_bsdf(kg, sd, type); - } - } - else { - shader_eval_surface(kg, sd, state, NULL, 0); - color = kernel_bake_shader_bsdf(kg, sd, type); - } - - if (is_direct) { - out += safe_divide_even_color(direct, color); - } - - if (is_indirect) { - out += safe_divide_even_color(indirect, color); - } - - return out; -} - -ccl_device void kernel_bake_evaluate( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - /* Setup render buffers. 
*/ - const int index = offset + x + y * stride; - const int pass_stride = kernel_data.film.pass_stride; - buffer += index * pass_stride; - - ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive; - ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential; - ccl_global float *output = buffer + kernel_data.film.pass_combined; - - int seed = __float_as_uint(primitive[0]); - int prim = __float_as_uint(primitive[1]); - if (prim == -1) - return; - - prim += kernel_data.bake.tri_offset; - - /* Random number generator. */ - uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed; - int num_samples = kernel_data.integrator.aa_samples; - - float filter_x, filter_y; - if (sample == 0) { - filter_x = filter_y = 0.5f; - } - else { - path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); - } - - /* Barycentric UV with sub-pixel offset. */ - float u = primitive[2]; - float v = primitive[3]; - - float dudx = differential[0]; - float dudy = differential[1]; - float dvdx = differential[2]; - float dvdy = differential[3]; - - if (sample > 0) { - u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f); - v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f), - 1.0f - u); - } - - /* Shader data setup. */ - int object = kernel_data.bake.object_index; - int shader; - float3 P, Ng; - - triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); - - ShaderData sd; - shader_setup_from_sample( - kg, - &sd, - P, - Ng, - Ng, - shader, - object, - prim, - u, - v, - 1.0f, - 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), - LAMP_NONE); - sd.I = sd.N; - - /* Setup differentials. */ - sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx; - sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy; - sd.du.dx = dudx; - sd.du.dy = dudy; - sd.dv.dx = dvdx; - sd.dv.dy = dvdy; - - /* Set RNG state for shaders that use sampling. 
*/ - PathState state = {0}; - state.rng_hash = rng_hash; - state.rng_offset = 0; - state.sample = sample; - state.num_samples = num_samples; - state.min_ray_pdf = FLT_MAX; - - /* Light passes if we need more than color. */ - PathRadiance L; - int pass_filter = kernel_data.bake.pass_filter; - - if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); - - float3 out = zero_float3(); - - ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type; - switch (type) { - /* data passes */ - case SHADER_EVAL_NORMAL: - case SHADER_EVAL_ROUGHNESS: - case SHADER_EVAL_EMISSION: { - if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) { - int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0; - shader_eval_surface(kg, &sd, &state, NULL, path_flag); - } - - if (type == SHADER_EVAL_NORMAL) { - float3 N = sd.N; - if (sd.flag & SD_HAS_BUMP) { - N = shader_bsdf_average_normal(kg, &sd); - } +#pragma once - /* encoding: normal = (2 * color) - 1 */ - out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); - } - else if (type == SHADER_EVAL_ROUGHNESS) { - float roughness = shader_bsdf_average_roughness(&sd); - out = make_float3(roughness, roughness, roughness); - } - else { - out = shader_emissive_eval(&sd); - } - break; - } - case SHADER_EVAL_UV: { - out = primitive_uv(kg, &sd); - break; - } -# ifdef __PASSES__ - /* light passes */ - case SHADER_EVAL_AO: { - out = L.ao; - break; - } - case SHADER_EVAL_COMBINED: { - if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) { - float alpha; - out = path_radiance_clamp_and_sum(kg, &L, &alpha); - break; - } +#include "kernel/kernel_differential.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shader.h" - if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT) - out += L.direct_diffuse; - if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT) - out += L.indirect_diffuse; +#include 
"kernel/geom/geom.h" - if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT) - out += L.direct_glossy; - if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT) - out += L.indirect_glossy; - - if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT) - out += L.direct_transmission; - if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT) - out += L.indirect_transmission; - - if ((pass_filter & BAKE_FILTER_EMISSION) != 0) - out += L.emission; - - break; - } - case SHADER_EVAL_SHADOW: { - out = L.shadow; - break; - } - case SHADER_EVAL_DIFFUSE: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter); - break; - } - case SHADER_EVAL_GLOSSY: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter); - break; - } - case SHADER_EVAL_TRANSMISSION: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter); - break; - } -# endif - - /* extra */ - case SHADER_EVAL_ENVIRONMENT: { - /* setup ray */ - Ray ray; - - ray.P = zero_float3(); - ray.D = normalize(P); - ray.t = 0.0f; -# ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -# endif - -# ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); -# endif - - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); - - /* evaluate */ - int path_flag = 0; /* we can't know which type of BSDF this is for */ - shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION); - out = shader_background_eval(&sd); - break; - } - default: { - /* no real shader, returning the position of the verts for debugging */ - out = normalize(P); - break; - } - } - - /* write output */ - const float4 result = make_float4(out.x, out.y, out.z, 1.0f); - 
kernel_write_pass_float4(output, result); -} - -#endif /* __BAKING__ */ +CCL_NAMESPACE_BEGIN -ccl_device void kernel_displace_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, +ccl_device void kernel_displace_evaluate(const KernelGlobals *kg, + ccl_global const KernelShaderEvalInput *input, ccl_global float4 *output, - int i) + const int offset) { - ShaderData sd; - PathState state = {0}; - uint4 in = input[i]; + /* Setup shader data. */ + const KernelShaderEvalInput in = input[offset]; - /* setup shader data */ - int object = in.x; - int prim = in.y; - float u = __uint_as_float(in.z); - float v = __uint_as_float(in.w); - - shader_setup_from_displace(kg, &sd, object, prim, u, v); + ShaderData sd; + shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v); - /* evaluate */ - float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state); + /* Evaluate displacement shader. */ + const float3 P = sd.P; + shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd); float3 D = sd.P - P; object_inverse_dir_transform(kg, &sd, &D); - /* write output */ - output[i] += make_float4(D.x, D.y, D.z, 0.0f); + /* Write output. 
*/ + output[offset] += make_float4(D.x, D.y, D.z, 0.0f); } -ccl_device void kernel_background_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, +ccl_device void kernel_background_evaluate(const KernelGlobals *kg, + ccl_global const KernelShaderEvalInput *input, ccl_global float4 *output, - int i) + const int offset) { - ShaderData sd; - PathState state = {0}; - uint4 in = input[i]; - - /* setup ray */ - Ray ray; - float u = __uint_as_float(in.x); - float v = __uint_as_float(in.y); - - ray.P = zero_float3(); - ray.D = equirectangular_to_direction(u, v); - ray.t = 0.0f; -#ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -#endif + /* Setup ray */ + const KernelShaderEvalInput in = input[offset]; + const float3 ray_P = zero_float3(); + const float3 ray_D = equirectangular_to_direction(in.u, in.v); + const float ray_time = 0.5f; -#ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); -#endif - - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); + /* Setup shader data. */ + ShaderData sd; + shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time); - /* evaluate */ - int path_flag = 0; /* we can't know which type of BSDF this is for */ - shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION); - float3 color = shader_background_eval(&sd); + /* Evaluate shader. + * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */ + const int path_flag = PATH_RAY_EMISSION; + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag); + const float3 color = shader_background_eval(&sd); - /* write output */ - output[i] += make_float4(color.x, color.y, color.z, 0.0f); + /* Write output. 
*/ + output[offset] += make_float4(color.x, color.y, color.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 1bfac37158d..7be5da8fe6d 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -14,6 +14,13 @@ * limitations under the License. */ +#pragma once + +#include "kernel_differential.h" +#include "kernel_lookup_table.h" +#include "kernel_montecarlo.h" +#include "kernel_projection.h" + CCL_NAMESPACE_BEGIN /* Perspective Camera */ @@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u return bokeh; } -ccl_device void camera_sample_perspective(KernelGlobals *kg, +ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg, float raster_x, float raster_y, float lens_u, @@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, #ifdef __RAY_DIFFERENTIALS__ float3 Dcenter = transform_direction(&cameratoworld, Pcamera); - - ray->dP = differential3_zero(); - ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter); - ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter); + float3 Dcenter_normalized = normalize(Dcenter); + + /* TODO: can this be optimized to give compact differentials directly? 
*/ + ray->dP = differential_zero_compact(); + differential3 dD; + dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized; + dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized; + ray->dD = differential_make_compact(dD); #endif } else { @@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, Dx = normalize(transform_direction(&cameratoworld, Dx)); spherical_stereo_transform(&kernel_data.cam, &Px, &Dx); - ray->dP.dx = Px - Pcenter; - ray->dD.dx = Dx - Dcenter; + differential3 dP, dD; + + dP.dx = Px - Pcenter; + dD.dx = Dx - Dcenter; float3 Py = Pnostereo; float3 Dy = transform_perspective(&rastertocamera, @@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, Dy = normalize(transform_direction(&cameratoworld, Dy)); spherical_stereo_transform(&kernel_data.cam, &Py, &Dy); - ray->dP.dy = Py - Pcenter; - ray->dD.dy = Dy - Dcenter; + dP.dy = Py - Pcenter; + dD.dy = Dy - Dcenter; + ray->dD = differential_make_compact(dD); + ray->dP = differential_make_compact(dP); #endif } @@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float z_inv = 1.0f / normalize(Pcamera).z; float nearclip = kernel_data.cam.nearclip * z_inv; ray->P += nearclip * ray->D; - ray->dP.dx += nearclip * ray->dD.dx; - ray->dP.dy += nearclip * ray->dD.dy; + ray->dP += nearclip * ray->dD; ray->t = kernel_data.cam.cliplength * z_inv; #else ray->t = FLT_MAX; @@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, } /* Orthographic Camera */ -ccl_device void camera_sample_orthographic(KernelGlobals *kg, +ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg, float raster_x, float raster_y, float lens_u, @@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, #ifdef __RAY_DIFFERENTIALS__ /* ray differential */ - ray->dP.dx = float4_to_float3(kernel_data.cam.dx); - ray->dP.dy = 
float4_to_float3(kernel_data.cam.dy); + differential3 dP; + dP.dx = float4_to_float3(kernel_data.cam.dx); + dP.dy = float4_to_float3(kernel_data.cam.dy); - ray->dD = differential3_zero(); + ray->dP = differential_make_compact(dP); + ray->dD = differential_zero_compact(); #endif #ifdef __CAMERA_CLIPPING__ @@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, spherical_stereo_transform(cam, &Px, &Dx); } - ray->dP.dx = Px - Pcenter; - ray->dD.dx = Dx - Dcenter; + differential3 dP, dD; + dP.dx = Px - Pcenter; + dD.dx = Dx - Dcenter; float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f)); float3 Dy = panorama_to_direction(cam, Py.x, Py.y); @@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, spherical_stereo_transform(cam, &Py, &Dy); } - ray->dP.dy = Py - Pcenter; - ray->dD.dy = Dy - Dcenter; + dP.dy = Py - Pcenter; + dD.dy = Dy - Dcenter; + ray->dD = differential_make_compact(dD); + ray->dP = differential_make_compact(dP); #endif #ifdef __CAMERA_CLIPPING__ /* clipping */ float nearclip = cam->nearclip; ray->P += nearclip * ray->D; - ray->dP.dx += nearclip * ray->dD.dx; - ray->dP.dy += nearclip * ray->dD.dy; + ray->dP += nearclip * ray->dD; ray->t = cam->cliplength; #else ray->t = FLT_MAX; @@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, /* Common */ -ccl_device_inline void camera_sample(KernelGlobals *kg, +ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg, int x, int y, float filter_u, @@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg, /* Utilities */ -ccl_device_inline float3 camera_position(KernelGlobals *kg) +ccl_device_inline float3 camera_position(const KernelGlobals *kg) { Transform cameratoworld = kernel_data.cam.cameratoworld; return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); } -ccl_device_inline float
camera_distance(KernelGlobals *kg, float3 P) +ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P) { Transform cameratoworld = kernel_data.cam.cameratoworld; float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); @@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P) } } -ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P) +ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P) { if (kernel_data.cam.type != CAMERA_PANORAMA) { Transform worldtocamera = kernel_data.cam.worldtocamera; @@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P) } } -ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P) +ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P) { Transform cameratoworld = kernel_data.cam.cameratoworld; @@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P } } -ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P) +ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P) { if (kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h index 5eb1bdad02e..960774e0741 100644 --- a/intern/cycles/kernel/kernel_color.h +++ b/intern/cycles/kernel/kernel_color.h @@ -14,25 +14,22 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COLOR_H__ -#define __KERNEL_COLOR_H__ +#pragma once #include "util/util_color.h" CCL_NAMESPACE_BEGIN -ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz) +ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz) { return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz), dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz), dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz)); } -ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c) +ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c) { return dot(c, float4_to_float3(kernel_data.film.rgb_to_y)); } CCL_NAMESPACE_END - -#endif /* __KERNEL_COLOR_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h deleted file mode 100644 index 4a9304a134c..00000000000 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __KERNEL_COMPAT_OPENCL_H__ -#define __KERNEL_COMPAT_OPENCL_H__ - -#define __KERNEL_GPU__ -#define __KERNEL_OPENCL__ - -/* no namespaces in opencl */ -#define CCL_NAMESPACE_BEGIN -#define CCL_NAMESPACE_END - -#ifdef __CL_NOINLINE__ -# define ccl_noinline __attribute__((noinline)) -#else -# define ccl_noinline -#endif - -/* in opencl all functions are device functions, so leave this empty */ -#define ccl_device -#define ccl_device_inline ccl_device -#define ccl_device_forceinline ccl_device -#define ccl_device_noinline ccl_device ccl_noinline -#define ccl_device_noinline_cpu ccl_device -#define ccl_may_alias -#define ccl_static_constant static __constant -#define ccl_constant __constant -#define ccl_global __global -#define ccl_local __local -#define ccl_local_param __local -#define ccl_private __private -#define ccl_restrict restrict -#define ccl_ref -#define ccl_align(n) __attribute__((aligned(n))) -#define ccl_optional_struct_init - -#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION) -# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1))) -#else -# define ccl_loop_no_unroll -#endif - -#ifdef __SPLIT_KERNEL__ -# define ccl_addr_space __global -#else -# define ccl_addr_space -#endif - -#define ATTR_FALLTHROUGH - -#define ccl_local_id(d) get_local_id(d) -#define ccl_global_id(d) get_global_id(d) - -#define ccl_local_size(d) get_local_size(d) -#define ccl_global_size(d) get_global_size(d) - -#define ccl_group_id(d) get_group_id(d) -#define ccl_num_groups(d) get_num_groups(d) - -/* Selective nodes compilation. 
*/ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - -/* no assert in opencl */ -#define kernel_assert(cond) - -/* make_type definitions with opencl style element initializers */ -#ifdef make_float2 -# undef make_float2 -#endif -#ifdef make_float3 -# undef make_float3 -#endif -#ifdef make_float4 -# undef make_float4 -#endif -#ifdef make_int2 -# undef make_int2 -#endif -#ifdef make_int3 -# undef make_int3 -#endif -#ifdef make_int4 -# undef make_int4 -#endif -#ifdef make_uchar4 -# undef make_uchar4 -#endif - -#define make_float2(x, y) ((float2)(x, y)) -#define make_float3(x, y, z) ((float3)(x, y, z)) -#define make_float4(x, y, z, w) ((float4)(x, y, z, w)) -#define make_int2(x, y) ((int2)(x, y)) -#define make_int3(x, y, z) ((int3)(x, y, z)) -#define make_int4(x, y, z, w) ((int4)(x, y, z, w)) -#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w)) - -/* math functions */ -#define __uint_as_float(x) as_float(x) -#define __float_as_uint(x) as_uint(x) -#define __int_as_float(x) as_float(x) -#define __float_as_int(x) as_int(x) -#define powf(x, y) pow(((float)(x)), ((float)(y))) -#define fabsf(x) fabs(((float)(x))) -#define copysignf(x, y) copysign(((float)(x)), ((float)(y))) -#define asinf(x) asin(((float)(x))) -#define acosf(x) acos(((float)(x))) -#define atanf(x) atan(((float)(x))) -#define floorf(x) floor(((float)(x))) -#define ceilf(x) ceil(((float)(x))) -#define hypotf(x, y) hypot(((float)(x)), ((float)(y))) -#define atan2f(x, y) atan2(((float)(x)), ((float)(y))) -#define fmaxf(x, y) fmax(((float)(x)), ((float)(y))) -#define fminf(x, y) fmin(((float)(x)), ((float)(y))) -#define fmodf(x, y) fmod((float)(x), (float)(y)) -#define sinhf(x) sinh(((float)(x))) -#define coshf(x) cosh(((float)(x))) -#define tanhf(x) tanh(((float)(x))) - -/* Use native functions with possibly lower precision for performance, - * no issues found so far. 
*/ -#if 1 -# define sinf(x) native_sin(((float)(x))) -# define cosf(x) native_cos(((float)(x))) -# define tanf(x) native_tan(((float)(x))) -# define expf(x) native_exp(((float)(x))) -# define sqrtf(x) native_sqrt(((float)(x))) -# define logf(x) native_log(((float)(x))) -# define rcp(x) native_recip(x) -#else -# define sinf(x) sin(((float)(x))) -# define cosf(x) cos(((float)(x))) -# define tanf(x) tan(((float)(x))) -# define expf(x) exp(((float)(x))) -# define sqrtf(x) sqrt(((float)(x))) -# define logf(x) log(((float)(x))) -# define rcp(x) recip(x) -#endif - -/* data lookup defines */ -#define kernel_data (*kg->data) -#define kernel_tex_array(tex) \ - ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data)) -#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)] - -/* define NULL */ -#ifndef NULL -# define NULL ((void *)0) -#endif - -/* enable extensions */ -#ifdef __KERNEL_CL_KHR_FP16__ -# pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - -#include "util/util_half.h" -#include "util/util_types.h" - -#endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h index 3ec0cdbaccc..db4e110bd10 100644 --- a/intern/cycles/kernel/kernel_differential.h +++ b/intern/cycles/kernel/kernel_differential.h @@ -14,26 +14,28 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* See "Tracing Ray Differentials", Homan Igehy, 1999. 
*/ -ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, - const differential3 dP, - float3 D, - const differential3 dD, - float3 Ng, - float t) +ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP, + const differential3 ray_dP, + float3 ray_D, + const differential3 ray_dD, + float3 surface_Ng, + float ray_t) { /* ray differential transfer through homogeneous medium, to * compute dPdx/dy at a shading point from the incoming ray */ - float3 tmp = D / dot(D, Ng); - float3 tmpx = dP.dx + t * dD.dx; - float3 tmpy = dP.dy + t * dD.dy; + float3 tmp = ray_D / dot(ray_D, surface_Ng); + float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx; + float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy; - dP_->dx = tmpx - dot(tmpx, Ng) * tmp; - dP_->dy = tmpy - dot(tmpy, Ng) * tmp; + surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp; + surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp; } ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD) @@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero() return d; } +/* Compact ray differentials that are just a scale to reduce memory usage and + * access cost in GPU. + * + * See above for more accurate reference implementations. + * + * TODO: also store the more compact version in ShaderData and recompute where + * needed? 
*/ + +ccl_device_forceinline float differential_zero_compact() +{ + return 0.0f; +} + +ccl_device_forceinline float differential_make_compact(const differential3 D) +{ + return 0.5f * (len(D.dx) + len(D.dy)); +} + +ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP, + const float ray_dP, + const float3 /* ray_D */, + const float ray_dD, + const float3 surface_Ng, + const float ray_t) +{ + /* ray differential transfer through homogeneous medium, to + * compute dPdx/dy at a shading point from the incoming ray */ + float scale = ray_dP + ray_t * ray_dD; + + float3 dx, dy; + make_orthonormals(surface_Ng, &dx, &dy); + surface_dP->dx = dx * scale; + surface_dP->dy = dy * scale; +} + +ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI, + const float3 D, + const float dD) +{ + /* compute dIdx/dy at a shading point, we just need to negate the + * differential of the ray direction */ + + float3 dx, dy; + make_orthonormals(D, &dx, &dy); + + dI->dx = dD * dx; + dI->dy = dD * dy; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index aebf2ec8e28..d62285d173d 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -14,40 +14,36 @@ * limitations under the License. */ +#pragma once + +#include "kernel/kernel_light.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + CCL_NAMESPACE_BEGIN -/* Direction Emission */ -ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - float3 I, - differential3 dI, - float t, - float time) +/* Evaluate shader on light. 
*/ +ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict emission_sd, + LightSample *ccl_restrict ls, + float time) { /* setup shading at emitter */ float3 eval = zero_float3(); if (shader_constant_emission_eval(kg, ls->shader, &eval)) { - if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) { + if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) { ls->Ng = -ls->Ng; } } else { /* Setup shader data and call shader_eval_surface once, better * for GPU coherence and compile times. */ + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP); #ifdef __BACKGROUND_MIS__ if (ls->type == LIGHT_BACKGROUND) { - Ray ray; - ray.D = ls->D; - ray.P = ls->P; - ray.t = 1.0f; - ray.time = time; - ray.dP = differential3_zero(); - ray.dD = dI; - - shader_setup_from_background(kg, emission_sd, &ray); + shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time); } else #endif @@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, emission_sd, ls->P, ls->Ng, - I, + -ls->D, ls->shader, ls->object, ls->prim, ls->u, ls->v, - t, + ls->t, time, false, ls->lamp); @@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, ls->Ng = emission_sd->Ng; } + PROFILING_SHADER(emission_sd->object, emission_sd->shader); + PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL); + /* No proper path flag, we're evaluating this for all closures. that's * weak but we'd have to do multiple evaluations otherwise. */ - path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION); - path_state_modify_bounce(state, false); + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION); /* Evaluate closures. 
*/ #ifdef __BACKGROUND_MIS__ @@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, return eval; } -ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - Ray *ray, - BsdfEval *eval, - bool *is_lamp, - float rand_terminate) +/* Test if light sample is from a light or emission from geometry. */ +ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls) { - if (ls->pdf == 0.0f) - return false; - - /* todo: implement */ - differential3 dD = differential3_zero(); + /* return if it's a lamp for shadow pass */ + return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); +} - /* evaluate closure */ +/* Early path termination of shadow rays. */ +ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg, + const LightSample *ccl_restrict ls, + BsdfEval *ccl_restrict eval, + const float rand_terminate) +{ + if (bsdf_eval_is_zero(eval)) { + return true; + } - float3 light_eval = direct_emissive_eval( - kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time); + if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { + float probability = max3(fabs(bsdf_eval_sum(eval))) * + kernel_data.integrator.light_inv_rr_threshold; + if (probability < 1.0f) { + if (rand_terminate >= probability) { + return true; + } + bsdf_eval_mul(eval, 1.0f / probability); + } + } - if (is_zero(light_eval)) - return false; + return false; +} - /* evaluate BSDF at shading point */ +/* This function should be used to compute a modified ray start position for + * rays leaving from a surface. The algorithm slightly distorts flat surface + * of a triangle. Surface is lifted by amount h along normal n in the incident + * point. 
*/ -#ifdef __VOLUME__ - if (sd->prim != PRIM_NONE) - shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); +ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + float3 Ng) +{ + float3 V[3], N[3]; + triangle_vertices_and_normals(kg, sd->prim, V, N); + + const float u = sd->u, v = sd->v; + const float w = 1 - u - v; + float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */ + float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */ + + object_normal_transform(kg, sd, &n); /* Normal x scale, world space */ + + /* Parabolic approximation */ + float a = dot(N[2] - N[0], V[0] - V[2]); + float b = dot(N[2] - N[1], V[1] - V[2]); + float c = dot(N[1] - N[0], V[1] - V[0]); + float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1); + + /* Check flipped normals */ + if (dot(n, Ng) > 0) { + /* Local linear envelope */ + float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f); + float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f); + float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f); + h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f); + h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f); + h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f); + h = max(min(min(h0, h1), h2), h * 0.5f); + } else { - float bsdf_pdf; - shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); - if (ls->shader & SHADER_USE_MIS) { - /* Multiple importance sampling. 
*/ - float mis_weight = power_heuristic(ls->pdf, bsdf_pdf); - light_eval *= mis_weight; - } + float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f); + float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f); + float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f); + h0 = max(dot(P - V[0], N[0]) + h0, 0.0f); + h1 = max(dot(P - V[1], N[1]) + h1, 0.0f); + h2 = max(dot(P - V[2], N[2]) + h2, 0.0f); + h = min(-min(min(h0, h1), h2), h * 0.5f); } -#else - shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); -#endif - bsdf_eval_mul3(eval, light_eval / ls->pdf); - -#ifdef __PASSES__ - /* use visibility flag to skip lights */ - if (ls->shader & SHADER_EXCLUDE_ANY) { - if (ls->shader & SHADER_EXCLUDE_DIFFUSE) - eval->diffuse = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_GLOSSY) - eval->glossy = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_TRANSMIT) - eval->transmission = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_SCATTER) - eval->volume = zero_float3(); - } -#endif + return n * h; +} - if (bsdf_eval_is_zero(eval)) - return false; +/* Ray offset to avoid shadow terminator artifact. */ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f -#ifdef __SHADOW_TRICKS__ - && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0 -#endif - ) { - float probability = max3(fabs(bsdf_eval_sum(eval))) * - kernel_data.integrator.light_inv_rr_threshold; - if (probability < 1.0f) { - if (rand_terminate >= probability) { - return false; +ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + float3 L) +{ + float NL = dot(sd->N, L); + bool transmit = (NL < 0.0f); + float3 Ng = (transmit ? 
-sd->Ng : sd->Ng); + float3 P = ray_offset(sd->P, Ng); + + if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) { + const float offset_cutoff = + kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset; + /* Do ray offset (heavy stuff) only for close to be terminated triangles: + * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also + * make a smooth transition near the threshold. */ + if (offset_cutoff > 0.0f) { + float NgL = dot(Ng, L); + float offset_amount = 0.0f; + if (NL < offset_cutoff) { + offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f); + } + else { + offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f); + } + if (offset_amount > 0.0f) { + P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount; } - bsdf_eval_mul(eval, 1.0f / probability); } } + return P; +} + +ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + const float3 P, + Ray *ray) +{ if (ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - ray->P = ray_offset_shadow(kg, sd, ls->D); + ray->P = P; if (ls->t == FLT_MAX) { /* distant light */ @@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg, } else { /* other lights, avoid self-intersection */ - ray->D = ray_offset(ls->P, ls->Ng) - ray->P; + ray->D = ray_offset(ls->P, ls->Ng) - P; ray->D = normalize_len(ray->D, &ray->t); } - - ray->dP = sd->dP; - ray->dD = differential3_zero(); } else { /* signal to not cast shadow ray */ + ray->P = zero_float3(); + ray->D = zero_float3(); ray->t = 0.0f; } - /* return if it's a lamp for shadow pass */ - *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); - - return true; + ray->dP = differential_make_compact(sd->dP); + ray->dD = differential_zero_compact(); + ray->time = sd->time; } -/* Indirect Primitive Emission */ - -ccl_device_noinline_cpu float3 indirect_primitive_emission( - KernelGlobals *kg, 
ShaderData *sd, float t, int path_flag, float bsdf_pdf) +/* Create shadow ray towards light sample. */ +ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + Ray *ray) { - /* evaluate emissive closure */ - float3 L = shader_emissive_eval(sd); - -#ifdef __HAIR__ - if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && - (sd->type & PRIMITIVE_ALL_TRIANGLE)) -#else - if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) -#endif - { - /* multiple importance sampling, get triangle light pdf, - * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd, t); - float mis_weight = power_heuristic(bsdf_pdf, pdf); - - return L * mis_weight; - } - - return L; + const float3 P = shadow_ray_offset(kg, sd, ls->D); + shadow_ray_setup(sd, ls, P, ray); } -/* Indirect Lamp Emission */ - -ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - PathRadiance *L, - Ray *ray, - float3 throughput) +/* Create shadow ray towards light sample. 
*/ +ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + const float3 P, + Ray *ray) { - for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { - LightSample ls ccl_optional_struct_init; - - if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls)) - continue; - -#ifdef __PASSES__ - /* use visibility flag to skip lights */ - if (ls.shader & SHADER_EXCLUDE_ANY) { - if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((ls.shader & SHADER_EXCLUDE_GLOSSY) && - ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == - (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || - ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) - continue; - } -#endif - - float3 lamp_L = direct_emissive_eval( - kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time); - -#ifdef __VOLUME__ - if (state->volume_stack[0].shader != SHADER_NONE) { - /* shadow attenuation */ - Ray volume_ray = *ray; - volume_ray.t = ls.t; - float3 volume_tp = one_float3(); - kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp); - lamp_L *= volume_tp; - } -#endif - - if (!(state->flag & PATH_RAY_MIS_SKIP)) { - /* multiple importance sampling, get regular light pdf, - * and compute weight with respect to BSDF pdf */ - float mis_weight = power_heuristic(state->ray_pdf, ls.pdf); - lamp_L *= mis_weight; - } - - path_radiance_accum_emission(kg, L, state, throughput, lamp_L); - } -} - -/* Indirect Background */ - -ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ccl_addr_space Ray *ray) -{ -#ifdef __BACKGROUND__ - int shader = kernel_data.background.surface_shader; - - /* Use visibility flag to skip lights. 
*/ - if (shader & SHADER_EXCLUDE_ANY) { - if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((shader & SHADER_EXCLUDE_GLOSSY) && - ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == - (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || - ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) || - ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) - return zero_float3(); - } - - /* Evaluate background shader. */ - float3 L = zero_float3(); - if (!shader_constant_emission_eval(kg, shader, &L)) { -# ifdef __SPLIT_KERNEL__ - Ray priv_ray = *ray; - shader_setup_from_background(kg, emission_sd, &priv_ray); -# else - shader_setup_from_background(kg, emission_sd, ray); -# endif - - path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION); - path_state_modify_bounce(state, false); - - L = shader_background_eval(emission_sd); - } - - /* Background MIS weights. */ -# ifdef __BACKGROUND_MIS__ - /* Check if background light exists or if we should skip pdf. */ - if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) { - /* multiple importance sampling, get background light pdf for ray - * direction, and compute weight with respect to BSDF pdf */ - float pdf = background_light_pdf(kg, ray->P, ray->D); - float mis_weight = power_heuristic(state->ray_pdf, pdf); - - return L * mis_weight; - } -# endif - - return L; -#else - return make_float3(0.8f, 0.8f, 0.8f); -#endif + shadow_ray_setup(sd, ls, P, ray); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h index a6fd4f1dc7e..fa93f4830d1 100644 --- a/intern/cycles/kernel/kernel_film.h +++ b/intern/cycles/kernel/kernel_film.h @@ -14,119 +14,516 @@ * limitations under the License. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN -ccl_device float4 film_get_pass_result(KernelGlobals *kg, - ccl_global float *buffer, - float sample_scale, - int index, - bool use_display_sample_scale) -{ - float4 pass_result; - - int display_pass_stride = kernel_data.film.display_pass_stride; - int display_pass_components = kernel_data.film.display_pass_components; - - if (display_pass_components == 4) { - float4 in = *(ccl_global float4 *)(buffer + display_pass_stride + - index * kernel_data.film.pass_stride); - float alpha = use_display_sample_scale ? - (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) : - 1.0f; - - pass_result = make_float4(in.x, in.y, in.z, alpha); - - int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride; - if (display_divide_pass_stride != -1) { - ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride + - index * kernel_data.film.pass_stride); - float3 divided = safe_divide_even_color(float4_to_float3(pass_result), - float4_to_float3(*divide_in)); - pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w); - } +/* -------------------------------------------------------------------- + * Common utilities. + */ - if (kernel_data.film.use_display_exposure) { - float exposure = kernel_data.film.exposure; - pass_result *= make_float4(exposure, exposure, exposure, 1.0f); - } +/* The input buffer contains transparency = 1 - alpha, this converts it to + * alpha. Also clamp since alpha might end up outside of 0..1 due to Russian + * roulette. 
*/ +ccl_device_forceinline float film_transparency_to_alpha(float transparency) +{ + return saturate(1.0f - transparency); +} + +ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + return kfilm_convert->scale; + } + + if (kfilm_convert->pass_use_filter) { + const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count)); + return 1.0f / sample_count; + } + + return 1.0f; +} + +ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + return kfilm_convert->scale_exposure; + } + + const float scale = film_get_scale(kfilm_convert, buffer); + + if (kfilm_convert->pass_use_exposure) { + return scale * kfilm_convert->exposure; + } + + return scale; +} + +ccl_device_inline bool film_get_scale_and_scale_exposure( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict scale, + float *ccl_restrict scale_exposure) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + *scale = kfilm_convert->scale; + *scale_exposure = kfilm_convert->scale_exposure; + return true; + } + + const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count)); + if (!sample_count) { + *scale = 0.0f; + *scale_exposure = 0.0f; + return false; + } + + if (kfilm_convert->pass_use_filter) { + *scale = 1.0f / sample_count; } - else if (display_pass_components == 1) { - ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride + - index * kernel_data.film.pass_stride); - pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale); + else { + *scale = 1.0f; + } + + if (kfilm_convert->pass_use_exposure) { + *scale_exposure = *scale * kfilm_convert->exposure; + } + else { + 
*scale_exposure = *scale; + } + + return true; +} + +/* -------------------------------------------------------------------- + * Float (scalar) passes. + */ + +ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; +} + +ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + /* Note that we accumulate 1 - mist in the kernel to avoid having to + * track the mist values in the integrator state. */ + pixel[0] = saturate(1.0f - f * scale_exposure); +} + +ccl_device_inline void film_get_pass_pixel_sample_count( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see + * meaningful value when adaptive sampler stopped rendering image way before the maximum + * number of samples was reached (for examples when number of samples is set to 0 in + * viewport). 
*/ + + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = __float_as_uint(f) * kfilm_convert->scale; +} + +ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = f * scale_exposure; +} + +/* -------------------------------------------------------------------- + * Float 3 passes. + */ + +ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + /* Read light pass. */ + const float *in = buffer + kfilm_convert->pass_offset; + float3 f = make_float3(in[0], in[1], in[2]); + + /* Optionally add indirect light pass. */ + if (kfilm_convert->pass_indirect != PASS_UNUSED) { + const float *in_indirect = buffer + kfilm_convert->pass_indirect; + const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]); + f += f_indirect; + } + + /* Optionally divide out color. */ + if (kfilm_convert->pass_divide != PASS_UNUSED) { + const float *in_divide = buffer + kfilm_convert->pass_divide; + const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); + f = safe_divide_even_color(f, f_divide); + + /* Exposure only, sample scale cancels out. */ + f *= kfilm_convert->exposure; + } + else { + /* Sample scale and exposure. 
*/ + f *= film_get_scale_exposure(kfilm_convert, buffer); + } + + pixel[0] = f.x; + pixel[1] = f.y; + pixel[2] = f.z; +} + +ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure; + + pixel[0] = f.x; + pixel[1] = f.y; + pixel[2] = f.z; +} + +/* -------------------------------------------------------------------- + * Float4 passes. + */ + +ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED); + + const float *in = buffer + kfilm_convert->pass_offset; + const float *in_weight = buffer + kfilm_convert->pass_motion_weight; + + const float weight = in_weight[0]; + const float weight_inv = (weight > 0.0f) ? 
1.0f / weight : 0.0f; + + const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv; + + pixel[0] = motion.x; + pixel[1] = motion.y; + pixel[2] = motion.z; + pixel[3] = motion.w; +} + +ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale = film_get_scale(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float4 f = make_float4(in[0], in[1], in[2], in[3]); + + /* x and z contain integer IDs, don't rescale them. + * y and w contain matte weights, they get scaled. */ + pixel[0] = f.x; + pixel[1] = f.y * scale; + pixel[2] = f.z; + pixel[3] = f.w * scale; +} + +ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + float scale, scale_exposure; + film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure; + const float alpha = in[3] * scale; + + pixel[0] = color.x; + pixel[1] = color.y; + pixel[2] = color.z; + pixel[3] = alpha; +} + +ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + + /* 3rd channel contains transparency = 1 - alpha for the combined pass. 
*/ + + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + float scale, scale_exposure; + if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) { + pixel[0] = 0.0f; + pixel[1] = 0.0f; + pixel[2] = 0.0f; + pixel[3] = 0.0f; + return; } - return pass_result; + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure; + const float alpha = in[3] * scale; + + pixel[0] = color.x; + pixel[1] = color.y; + pixel[2] = color.z; + pixel[3] = film_transparency_to_alpha(alpha); } -ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale) +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +ccl_device_inline float3 +film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - float4 result; + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); - /* Conversion to SRGB. */ - result.x = color_linear_to_srgb(rgba_in.x * scale); - result.y = color_linear_to_srgb(rgba_in.y * scale); - result.z = color_linear_to_srgb(rgba_in.z * scale); + float scale, scale_exposure; + film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure); - /* Clamp since alpha might be > 1.0 due to Russian roulette. 
*/ - result.w = saturate(rgba_in.w * scale); + ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher; - return result; + const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure; + + return pixel; } -ccl_device uchar4 film_float_to_byte(float4 color) +ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b) { - uchar4 result; + float x, y, z; - /* simple float to byte conversion */ - result.x = (uchar)(saturate(color.x) * 255.0f); - result.y = (uchar)(saturate(color.y) * 255.0f); - result.z = (uchar)(saturate(color.z) * 255.0f); - result.w = (uchar)(saturate(color.w) * 255.0f); + x = (b.x != 0.0f) ? a.x / b.x : 1.0f; + y = (b.y != 0.0f) ? a.y / b.y : 1.0f; + z = (b.z != 0.0f) ? a.z / b.z : 1.0f; - return result; + return make_float3(x, y, z); } -ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) +ccl_device_inline float3 +film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - /* buffer offset */ - int index = offset + x + y * stride; + /* For the shadow catcher pass we divide combined pass by the shadow catcher. + * Note that denoised shadow catcher pass contains value which only needs ot be scaled (but not + * to be calculated as division). */ - bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1); - float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale); + if (kfilm_convert->is_denoised) { + return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer); + } - /* map colors */ - float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? 
sample_scale : 1.0f); - uchar4 uchar_result = film_float_to_byte(float_result); + kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED); - rgba += index; - *rgba = uchar_result; + /* If there is no shadow catcher object in this pixel, there is no modification of the light + * needed, so return one. */ + ccl_global const float *in_catcher_sample_count = + buffer + kfilm_convert->pass_shadow_catcher_sample_count; + const float num_samples = in_catcher_sample_count[0]; + if (num_samples == 0.0f) { + return one_float3(); + } + + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); + ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher; + + /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual + * shadow catcher objects in the scene. In this case there will be no auxillary passes required + * for the devision (to save up memory). So delay the asserts to this point so that the number of + * samples check handles such configuration. */ + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED); + + ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined; + ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte; + + /* No scaling needed. The integration works in way that number of samples in the combined and + * shadow catcher passes are the same, and exposure is cancelled during the division. 
*/ + const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]); + const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]); + const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]); + + /* Need to ignore contribution of the matte object when doing division (otherwise there will be + * artifacts caused by anti-aliasing). Since combined pass is used for adaptive sampling and need + * to contain matte objects, we subtrack matte objects contribution here. This is the same as if + * the matte objects were not accumulated to the combined pass. */ + const float3 combined_no_matte = color_combined - color_matte; + + const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher); + + const float scale = film_get_scale(kfilm_convert, buffer); + const float transparency = in_combined[3] * scale; + const float alpha = film_transparency_to_alpha(transparency); + + /* Alpha-over on white using transparency of the combined pass. This allows to eliminate + * artifacts which are happenning on an edge of a shadow catcher when using transparent film. + * Note that we treat shadow catcher as straight alpha here because alpha got cancelled out + * during the division. */ + const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher; + + return pixel; } -ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) +ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - /* buffer offset */ - int index = offset + x + y * stride; + /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation + * is possible. 
+ * + * The matte is alpha-overed onto the shadow (which is kind of alpha-overing shadow onto footage, + * and then alpha-overing synthetic objects on top). */ - bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1); - float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED); + + float scale, scale_exposure; + if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte; + + const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer); + const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure; + + const float transparency = in_matte[3] * scale; + const float alpha = saturate(1.0f - transparency); + + const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha; + + if (kfilm_convert->use_approximate_shadow_catcher_background) { + kernel_assert(kfilm_convert->pass_background != PASS_UNUSED); + + ccl_global const float *in_background = buffer + kfilm_convert->pass_background; + const float3 color_background = make_float3( + in_background[0], in_background[1], in_background[2]) * + scale_exposure; + const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte); + return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f); + } - ccl_global half *out = (ccl_global half *)rgba + index * 4; - float4_store_half(out, rgba_in, use_display_sample_scale ? 
sample_scale : 1.0f); + return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte); +} + +ccl_device_inline void film_get_pass_pixel_shadow_catcher( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + + const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer); + + pixel[0] = pixel_value.x; + pixel[1] = pixel_value.y; + pixel[2] = pixel_value.z; +} + +ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4); + + const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert, + buffer); + + pixel[0] = pixel_value.x; + pixel[1] = pixel_value.y; + pixel[2] = pixel_value.z; + if (kfilm_convert->num_components == 4) { + pixel[3] = pixel_value.w; + } +} + +/* -------------------------------------------------------------------- + * Compositing and overlays. 
+ */ + +ccl_device_inline void film_apply_pass_pixel_overlays_rgba( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + if (kfilm_convert->show_active_pixels && + kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) { + if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) { + const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f); + const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f); + pixel[0] = mix_rgb.x; + pixel[1] = mix_rgb.y; + pixel[2] = mix_rgb.z; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h deleted file mode 100644 index 70aed6d54ed..00000000000 --- a/intern/cycles/kernel/kernel_globals.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Constant Globals */ - -#ifndef __KERNEL_GLOBALS_H__ -#define __KERNEL_GLOBALS_H__ - -#include "kernel/kernel_profiling.h" - -#ifdef __KERNEL_CPU__ -# include "util/util_map.h" -# include "util/util_vector.h" -#endif - -#ifdef __KERNEL_OPENCL__ -# include "util/util_atomic.h" -#endif - -CCL_NAMESPACE_BEGIN - -/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in - * the kernel, to access constant data. These are all stored as "textures", but - * these are really just standard arrays. 
We can't use actually globals because - * multiple renders may be running inside the same process. */ - -#ifdef __KERNEL_CPU__ - -# ifdef __OSL__ -struct OSLGlobals; -struct OSLThreadData; -struct OSLShadingSystem; -# endif - -typedef unordered_map<float, float> CoverageMap; - -struct Intersection; -struct VolumeStep; - -typedef struct KernelGlobals { -# define KERNEL_TEX(type, name) texture<type> name; -# include "kernel/kernel_textures.h" - - KernelData __data; - -# ifdef __OSL__ - /* On the CPU, we also have the OSL globals here. Most data structures are shared - * with SVM, the difference is in the shaders and object/mesh attributes. */ - OSLGlobals *osl; - OSLShadingSystem *osl_ss; - OSLThreadData *osl_tdata; -# endif - - /* **** Run-time data **** */ - - /* Heap-allocated storage for transparent shadows intersections. */ - Intersection *transparent_shadow_intersections; - - /* Storage for decoupled volume steps. */ - VolumeStep *decoupled_volume_steps[2]; - int decoupled_volume_steps_index; - - /* A buffer for storing per-pixel coverage for Cryptomatte. 
*/ - CoverageMap *coverage_object; - CoverageMap *coverage_material; - CoverageMap *coverage_asset; - - /* split kernel */ - SplitData split_data; - SplitParams split_param_data; - - int2 global_size; - int2 global_id; - - ProfilingState profiler; -} KernelGlobals; - -#endif /* __KERNEL_CPU__ */ - -#ifdef __KERNEL_OPTIX__ - -typedef struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -} ShaderParams; - -typedef struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -} KernelParams; - -typedef struct KernelGlobals { -# ifdef __VOLUME__ - VolumeState volume_state; -# endif - Intersection hits_stack[64]; -} KernelGlobals; - -extern "C" __constant__ KernelParams __params; - -#else /* __KERNEL_OPTIX__ */ - -/* For CUDA, constant memory textures must be globals, so we can't put them - * into a struct. As a result we don't actually use this struct and use actual - * globals and simply pass along a NULL pointer everywhere, which we hope gets - * optimized out. */ - -# ifdef __KERNEL_CUDA__ - -__constant__ KernelData __data; -typedef struct KernelGlobals { - /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. 
*/ - Intersection hits_stack[64]; -} KernelGlobals; - -# define KERNEL_TEX(type, name) const __constant__ __device__ type *name; -# include "kernel/kernel_textures.h" - -# endif /* __KERNEL_CUDA__ */ - -#endif /* __KERNEL_OPTIX__ */ - -/* OpenCL */ - -#ifdef __KERNEL_OPENCL__ - -# define KERNEL_TEX(type, name) typedef type name##_t; -# include "kernel/kernel_textures.h" - -typedef ccl_addr_space struct KernelGlobals { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" - -# ifdef __SPLIT_KERNEL__ - SplitData split_data; - SplitParams split_param_data; -# endif -} KernelGlobals; - -# define KERNEL_BUFFER_PARAMS \ - ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \ - ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \ - ccl_global char *buffer6, ccl_global char *buffer7 - -# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 - -ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) -{ -# ifdef __SPLIT_KERNEL__ - if (ccl_local_id(0) + ccl_local_id(1) == 0) -# endif - { - kg->buffers[0] = buffer0; - kg->buffers[1] = buffer1; - kg->buffers[2] = buffer2; - kg->buffers[3] = buffer3; - kg->buffers[4] = buffer4; - kg->buffers[5] = buffer5; - kg->buffers[6] = buffer6; - kg->buffers[7] = buffer7; - } - -# ifdef __SPLIT_KERNEL__ - ccl_barrier(CCL_LOCAL_MEM_FENCE); -# endif -} - -ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) -{ -# ifdef __SPLIT_KERNEL__ - if (ccl_local_id(0) + ccl_local_id(1) == 0) -# endif - { - ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0]; - -# define KERNEL_TEX(type, name) kg->name = *(info++); -# include "kernel/kernel_textures.h" - } - -# ifdef __SPLIT_KERNEL__ - ccl_barrier(CCL_LOCAL_MEM_FENCE); -# endif -} - -#endif /* __KERNEL_OPENCL__ */ - -/* Interpolated lookup table 
access */ - -ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size) -{ - x = saturate(x) * (size - 1); - - int index = min(float_to_int(x), size - 1); - int nindex = min(index + 1, size - 1); - float t = x - index; - - float data0 = kernel_tex_fetch(__lookup_table, index + offset); - if (t == 0.0f) - return data0; - - float data1 = kernel_tex_fetch(__lookup_table, nindex + offset); - return (1.0f - t) * data0 + t * data1; -} - -ccl_device float lookup_table_read_2D( - KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize) -{ - y = saturate(y) * (ysize - 1); - - int index = min(float_to_int(y), ysize - 1); - int nindex = min(index + 1, ysize - 1); - float t = y - index; - - float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize); - if (t == 0.0f) - return data0; - - float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize); - return (1.0f - t) * data0 + t * data1; -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h index 1ca42e933d1..ed01f494f98 100644 --- a/intern/cycles/kernel/kernel_id_passes.h +++ b/intern/cycles/kernel/kernel_id_passes.h @@ -14,8 +14,18 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN +/* Element of ID pass stored in the render buffers. + * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the + * render buffers might not meet expected by compiler alignment. 
*/ +typedef struct IDPassBufferElement { + float x; + float y; +} IDPassBufferElement; + ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, @@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, } for (int slot = 0; slot < num_slots; slot++) { - ccl_global float2 *id_buffer = (ccl_global float2 *)buffer; + ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer; #ifdef __ATOMIC_PASS_WRITE__ /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */ if (id_buffer[slot].x == ID_NONE) { @@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots) { - ccl_global float2 *id_buffer = (ccl_global float2 *)buffer; + ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer; for (int slot = 1; slot < num_slots; ++slot) { if (id_buffer[slot].x == ID_NONE) { return; @@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl /* Since we're dealing with a tiny number of elements, insertion sort should be fine. 
*/ int i = slot; while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) { - float2 swap = id_buffer[i]; + const IDPassBufferElement swap = id_buffer[i]; id_buffer[i] = id_buffer[i - 1]; id_buffer[i - 1] = swap; --i; @@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl } } -#ifdef __KERNEL_GPU__ /* post-sorting for Cryptomatte */ -ccl_device void kernel_cryptomatte_post( - KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride) +ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index) { - if (sample - 1 == kernel_data.integrator.aa_samples) { - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + - kernel_data.film.pass_cryptomatte; - kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); - } + const int pass_stride = kernel_data.film.pass_stride; + const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride; + ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset + + kernel_data.film.pass_cryptomatte; + kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); } -#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index f4e60a807f7..354e8115538 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -14,93 +14,27 @@ * limitations under the License. */ -/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */ - +#pragma once CCL_NAMESPACE_BEGIN -/* "Correlated Multi-Jittered Sampling" - * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */ - -/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. 
*/ -#define CMJ_RANDOM_OFFSET_LIMIT 4096 - -ccl_device_inline bool cmj_is_pow2(int i) +ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed) { - return (i > 1) && ((i & (i - 1)) == 0); -} + x += seed; + x ^= (x * 0x6c50b47cu); + x ^= x * 0xb82f1e52u; + x ^= x * 0xc7afe638u; + x ^= x * 0x8d22f6e6u; -ccl_device_inline int cmj_fast_mod_pow2(int a, int b) -{ - return (a & (b - 1)); + return x; } -/* b must be > 1 */ -ccl_device_inline int cmj_fast_div_pow2(int a, int b) +ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed) { - kernel_assert(b > 1); - return a >> count_trailing_zeros(b); -} + x = reverse_integer_bits(x); + x = laine_karras_permutation(x, seed); + x = reverse_integer_bits(x); -ccl_device_inline uint cmj_w_mask(uint w) -{ - kernel_assert(w > 1); - return ((1 << (32 - count_leading_zeros(w))) - 1); -} - -ccl_device_inline uint cmj_permute(uint i, uint l, uint p) -{ - uint w = l - 1; - - if ((l & w) == 0) { - /* l is a power of two (fast) */ - i ^= p; - i *= 0xe170893d; - i ^= p >> 16; - i ^= (i & w) >> 4; - i ^= p >> 8; - i *= 0x0929eb3f; - i ^= p >> 23; - i ^= (i & w) >> 1; - i *= 1 | p >> 27; - i *= 0x6935fa69; - i ^= (i & w) >> 11; - i *= 0x74dcb303; - i ^= (i & w) >> 2; - i *= 0x9e501cc3; - i ^= (i & w) >> 2; - i *= 0xc860a3df; - i &= w; - i ^= i >> 5; - - return (i + p) & w; - } - else { - /* l is not a power of two (slow) */ - w = cmj_w_mask(w); - - do { - i ^= p; - i *= 0xe170893d; - i ^= p >> 16; - i ^= (i & w) >> 4; - i ^= p >> 8; - i *= 0x0929eb3f; - i ^= p >> 23; - i ^= (i & w) >> 1; - i *= 1 | p >> 27; - i *= 0x6935fa69; - i ^= (i & w) >> 11; - i *= 0x74dcb303; - i ^= (i & w) >> 2; - i *= 0x9e501cc3; - i ^= (i & w) >> 2; - i *= 0xc860a3df; - i &= w; - i ^= i >> 5; - } while (i >= l); - - return (i + p) % l; - } + return x; } ccl_device_inline uint cmj_hash(uint i, uint p) @@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p) return cmj_hash(i, p) * (1.0f / 4294967808.0f); } 
-#ifdef __CMJ__ -ccl_device float cmj_sample_1D(int s, int N, int p) +ccl_device_inline float cmj_randfloat_simple(uint i, uint p) { - kernel_assert(s < N); - - uint x = cmj_permute(s, N, p * 0x68bc21eb); - float jx = cmj_randfloat(s, p * 0x967a889b); - - float invN = 1.0f / N; - return (x + jx) * invN; + return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF); } -/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */ -ccl_device_inline int cmj_isqrt(int value) +ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension) { -# if defined(__KERNEL_CUDA__) - return float_to_int(__fsqrt_ru(value)); -# elif defined(__KERNEL_GPU__) - return float_to_int(sqrtf(value)); -# else - /* This is a work around for fast-math on CPU which might replace sqrtf() - * with am approximated version. - */ - return float_to_int(sqrtf(value) + 1e-6f); -# endif -} + /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D + * the x part is used as the sample (TODO(@leesonw): Add using both x and y parts + * independently). */ + + /* Perform Owen shuffle of the sample number to reorder the samples. */ +#ifdef _SIMPLE_HASH_ + const uint rv = cmj_hash_simple(dimension, rng_hash); +#else /* Use a _REGULAR_HASH_. */ + const uint rv = cmj_hash(dimension, rng_hash); +#endif +#ifdef _XOR_SHUFFLE_ +# warning "Using XOR shuffle." + const uint s = sample ^ rv; +#else /* Use _OWEN_SHUFFLE_ for reordering. */ + const uint s = nested_uniform_scramble(sample, rv); +#endif -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) -{ - kernel_assert(s < N); + /* Based on the sample number a sample pattern is selected and offset by the dimension. 
*/ + const uint sample_set = s / NUM_PMJ_SAMPLES; + const uint d = (dimension + sample_set); + const uint dim = d % NUM_PMJ_PATTERNS; + int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); + + float fx = kernel_tex_fetch(__sample_pattern_lut, index); - int m = cmj_isqrt(N); - int n = (N - 1) / m + 1; - float invN = 1.0f / N; - float invm = 1.0f / m; - float invn = 1.0f / n; +#ifndef _NO_CRANLEY_PATTERSON_ROTATION_ + /* Use Cranley-Patterson rotation to displace the sample pattern. */ +# ifdef _SIMPLE_HASH_ + float dx = cmj_randfloat_simple(d, rng_hash); +# else + /* Only jitter within the grid interval. */ + float dx = cmj_randfloat(d, rng_hash); +# endif + fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES); + fx = fx - floorf(fx); - s = cmj_permute(s, N, p * 0x51633e2d); +#else +# warning "Not using Cranley-Patterson Rotation." +#endif - int sdivm, smodm; + return fx; +} - if (cmj_is_pow2(m)) { - sdivm = cmj_fast_div_pow2(s, m); - smodm = cmj_fast_mod_pow2(s, m); - } - else { - /* Doing `s * inmv` gives precision issues here. */ - sdivm = s / m; - smodm = s - sdivm * m; - } +ccl_device void pmj_sample_2D( + const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y) +{ + /* Perform a shuffle on the sample number to reorder the samples. */ +#ifdef _SIMPLE_HASH_ + const uint rv = cmj_hash_simple(dimension, rng_hash); +#else /* Use a _REGULAR_HASH_. */ + const uint rv = cmj_hash(dimension, rng_hash); +#endif +#ifdef _XOR_SHUFFLE_ +# warning "Using XOR shuffle." + const uint s = sample ^ rv; +#else /* Use _OWEN_SHUFFLE_ for reordering. */ + const uint s = nested_uniform_scramble(sample, rv); +#endif - uint sx = cmj_permute(smodm, m, p * 0x68bc21eb); - uint sy = cmj_permute(sdivm, n, p * 0x02e5be93); + /* Based on the sample number a sample pattern is selected and offset by the dimension. 
*/ + const uint sample_set = s / NUM_PMJ_SAMPLES; + const uint d = (dimension + sample_set); + const uint dim = d % NUM_PMJ_PATTERNS; + int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); - float jx = cmj_randfloat(s, p * 0x967a889b); - float jy = cmj_randfloat(s, p * 0x368cc8b7); + float fx = kernel_tex_fetch(__sample_pattern_lut, index); + float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1); - *fx = (sx + (sy + jx) * invn) * invm; - *fy = (s + jy) * invN; -} +#ifndef _NO_CRANLEY_PATTERSON_ROTATION_ + /* Use Cranley-Patterson rotation to displace the sample pattern. */ +# ifdef _SIMPLE_HASH_ + float dx = cmj_randfloat_simple(d, rng_hash); + float dy = cmj_randfloat_simple(d + 1, rng_hash); +# else + float dx = cmj_randfloat(d, rng_hash); + float dy = cmj_randfloat(d + 1, rng_hash); +# endif + /* Only jitter within the grid cells. */ + fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS); + fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS); + fx = fx - floorf(fx); + fy = fy - floorf(fy); +#else +# warning "Not using Cranley Patterson Rotation." 
#endif -ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension) -{ - /* Fallback to random */ - if (sample >= NUM_PMJ_SAMPLES) { - const int p = rng_hash + dimension; - return cmj_randfloat(sample, p); - } - else { - const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff; - const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2; - return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f; - } -} - -ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension) -{ - if (sample >= NUM_PMJ_SAMPLES) { - const int p = rng_hash + dimension; - const float fx = cmj_randfloat(sample, p); - const float fy = cmj_randfloat(sample, p + 1); - return make_float2(fx, fy); - } - else { - const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2; - const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff; - const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff; - const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f; - const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) - - 1.0f; - return make_float2(fx, fy); - } + (*x) = fx; + (*y) = fy; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 42a834d2ce3..52f641634b9 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -14,7 +14,14 @@ * limitations under the License. 
*/ +#pragma once + +#include "geom/geom.h" + #include "kernel_light_background.h" +#include "kernel_montecarlo.h" +#include "kernel_projection.h" +#include "kernel_types.h" CCL_NAMESPACE_BEGIN @@ -37,10 +44,22 @@ typedef struct LightSample { /* Regular Light */ -ccl_device_inline bool lamp_light_sample( - KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls) +template<bool in_volume_segment> +ccl_device_inline bool light_sample(const KernelGlobals *kg, + const int lamp, + const float randu, + const float randv, + const float3 P, + const int path_flag, + LightSample *ls) { const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) { + if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) { + return false; + } + } + LightType type = (LightType)klight->type; ls->type = type; ls->shader = klight->shader_id; @@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample( ls->u = randu; ls->v = randv; + if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) { + /* Distant lights in a volume get a dummy sample, position will not actually + * be used in that case. Only when sampling from a specific scatter position + * do we actually need to evaluate these. 
*/ + ls->P = zero_float3(); + ls->Ng = zero_float3(); + ls->D = zero_float3(); + ls->pdf = true; + ls->t = FLT_MAX; + return true; + } + if (type == LIGHT_DISTANT) { /* distant light */ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); @@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample( float invarea = fabsf(klight->area.invarea); bool is_round = (klight->area.invarea < 0.0f); - if (dot(ls->P - P, Ng) > 0.0f) { - return false; + if (!in_volume_segment) { + if (dot(ls->P - P, Ng) > 0.0f) { + return false; + } } float3 inplane; - if (is_round) { + if (is_round || in_volume_segment) { inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv); ls->P += inplane; ls->pdf = invarea; @@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample( return (ls->pdf > 0.0f); } -ccl_device bool lamp_light_eval( - KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls) +ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg, + const Ray *ccl_restrict ray, + Intersection *ccl_restrict isect, + const int last_prim, + const int last_object, + const int last_type, + const int path_flag) { - const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); - LightType type = (LightType)klight->type; - ls->type = type; - ls->shader = klight->shader_id; - ls->object = PRIM_NONE; - ls->prim = PRIM_NONE; - ls->lamp = lamp; - /* todo: missing texture coordinates */ - ls->u = 0.0f; - ls->v = 0.0f; + for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); - if (!(ls->shader & SHADER_USE_MIS)) - return false; + if (path_flag & PATH_RAY_CAMERA) { + if (klight->shader_id & SHADER_EXCLUDE_CAMERA) { + continue; + } + } + else { + if (!(klight->shader_id & SHADER_USE_MIS)) { + continue; + } + } - if (type == LIGHT_DISTANT) { - /* distant light */ - float radius = klight->distant.radius; + if (path_flag & 
PATH_RAY_SHADOW_CATCHER_PASS) { + if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) { + continue; + } + } - if (radius == 0.0f) - return false; - if (t != FLT_MAX) - return false; + LightType type = (LightType)klight->type; + float t = 0.0f, u = 0.0f, v = 0.0f; - /* a distant light is infinitely far away, but equivalent to a disk - * shaped light exactly 1 unit away from the current shading point. - * - * radius t^2/cos(theta) - * <----------> t = sqrt(1^2 + tan(theta)^2) - * tan(th) area = radius*radius*pi - * <-----> - * \ | (1 + tan(theta)^2)/cos(theta) - * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta) - * t \th| 1 simplifies to - * \-| 1/(cos(theta)^3) - * \| magic! - * P - */ + if (type == LIGHT_POINT || type == LIGHT_SPOT) { + /* Sphere light. */ + const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); + const float radius = klight->spot.radius; + if (radius == 0.0f) { + continue; + } - float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); - float costheta = dot(-lightD, D); - float cosangle = klight->distant.cosangle; + float3 P; + if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) { + continue; + } + } + else if (type == LIGHT_AREA) { + /* Area light. */ + const float invarea = fabsf(klight->area.invarea); + const bool is_round = (klight->area.invarea < 0.0f); + if (invarea == 0.0f) { + continue; + } - if (costheta < cosangle) - return false; + const float3 axisu = make_float3( + klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + const float3 axisv = make_float3( + klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); + const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); - ls->P = -D; - ls->Ng = -D; - ls->D = D; - ls->t = FLT_MAX; + /* One sided. 
*/ + if (dot(ray->D, Ng) >= 0.0f) { + continue; + } - /* compute pdf */ - float invarea = klight->distant.invarea; - ls->pdf = invarea / (costheta * costheta * costheta); - ls->eval_fac = ls->pdf; + const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); + + float3 P; + if (!ray_quad_intersect( + ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) { + continue; + } + } + else { + continue; + } + + if (t < isect->t && + !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) { + isect->t = t; + isect->u = u; + isect->v = v; + isect->type = PRIMITIVE_LAMP; + isect->prim = lamp; + isect->object = OBJECT_NONE; + } + } + + return isect->prim != PRIM_NONE; +} + +ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg, + const float3 ray_D, + const int lamp, + LightSample *ccl_restrict ls) +{ + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + const int shader = klight->shader_id; + const float radius = klight->distant.radius; + const LightType type = (LightType)klight->type; + + if (type != LIGHT_DISTANT) { + return false; + } + if (!(shader & SHADER_USE_MIS)) { + return false; + } + if (radius == 0.0f) { + return false; } - else if (type == LIGHT_POINT || type == LIGHT_SPOT) { - float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); - float radius = klight->spot.radius; + /* a distant light is infinitely far away, but equivalent to a disk + * shaped light exactly 1 unit away from the current shading point. + * + * radius t^2/cos(theta) + * <----------> t = sqrt(1^2 + tan(theta)^2) + * tan(th) area = radius*radius*pi + * <-----> + * \ | (1 + tan(theta)^2)/cos(theta) + * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta) + * t \th| 1 simplifies to + * \-| 1/(cos(theta)^3) + * \| magic! 
+ * P + */ + + float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); + float costheta = dot(-lightD, ray_D); + float cosangle = klight->distant.cosangle; + + if (costheta < cosangle) + return false; - /* sphere light */ - if (radius == 0.0f) - return false; + ls->type = type; + ls->shader = klight->shader_id; + ls->object = PRIM_NONE; + ls->prim = PRIM_NONE; + ls->lamp = lamp; + /* todo: missing texture coordinates */ + ls->u = 0.0f; + ls->v = 0.0f; + ls->t = FLT_MAX; + ls->P = -ray_D; + ls->Ng = -ray_D; + ls->D = ray_D; + + /* compute pdf */ + float invarea = klight->distant.invarea; + ls->pdf = invarea / (costheta * costheta * costheta); + ls->pdf *= kernel_data.integrator.pdf_lights; + ls->eval_fac = ls->pdf; - if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) { - return false; - } + return true; +} - ls->Ng = -D; - ls->D = D; +ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect, + const float3 ray_P, + const float3 ray_D, + LightSample *ccl_restrict ls) +{ + const int lamp = isect->prim; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + LightType type = (LightType)klight->type; + ls->type = type; + ls->shader = klight->shader_id; + ls->object = PRIM_NONE; + ls->prim = PRIM_NONE; + ls->lamp = lamp; + /* todo: missing texture coordinates */ + ls->t = isect->t; + ls->P = ray_P + ray_D * ls->t; + ls->D = ray_D; + + if (type == LIGHT_POINT || type == LIGHT_SPOT) { + ls->Ng = -ray_D; float invarea = klight->spot.invarea; ls->eval_fac = (0.25f * M_1_PI_F) * invarea; @@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval( ls->eval_fac *= spot_light_attenuation( dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng); - if (ls->eval_fac == 0.0f) + if (ls->eval_fac == 0.0f) { return false; + } } float2 uv = map_to_sphere(ls->Ng); ls->u = uv.x; @@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval( else if (type == 
LIGHT_AREA) { /* area light */ float invarea = fabsf(klight->area.invarea); - bool is_round = (klight->area.invarea < 0.0f); - if (invarea == 0.0f) - return false; float3 axisu = make_float3( klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); float3 axisv = make_float3( klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); - - /* one sided */ - if (dot(D, Ng) >= 0.0f) - return false; - float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); - if (!ray_quad_intersect( - P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) { - return false; - } - - ls->D = D; + ls->u = isect->u; + ls->v = isect->v; + ls->D = ray_D; ls->Ng = Ng; + + const bool is_round = (klight->area.invarea < 0.0f); if (is_round) { - ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t); + ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t); } else { float3 sample_axisu = axisu; @@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval( if (klight->area.tan_spread > 0.0f) { if (!light_spread_clamp_area_light( - P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) { + ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) { return false; } } - ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false); + ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false); } ls->eval_fac = 0.25f * invarea; @@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval( } } else { + kernel_assert(!"Invalid lamp type in light_sample_from_intersection"); return false; } @@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval( /* returns true if the triangle is has motion blur or an instancing transform applied */ ccl_device_inline bool triangle_world_space_vertices( - KernelGlobals *kg, int object, int prim, float time, float3 V[3]) + const KernelGlobals *kg, int 
object, int prim, float time, float3 V[3]) { bool has_motion = false; const int object_flag = kernel_tex_fetch(__object_flag, object); @@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices( return has_motion; } -ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, +ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg, const float3 Ng, const float3 I, float t) @@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, return t * t * pdf / cos_pi; } -ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg, + const ShaderData *sd, + float t) { /* A naive heuristic to decide between costly solid angle sampling * and simple area sampling, comparing the distance to the triangle plane @@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s } } -ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, +template<bool in_volume_segment> +ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg, int prim, int object, float randu, @@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0)); - if (longest_edge_squared > distance_to_plane * distance_to_plane) { + if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) { /* see James Arvo, "Stratified Sampling of Spherical Triangles" * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ @@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, /* Light Distribution */ -ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) +ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu) { /* This is basically std::upper_bound as used by PBRT, to find a point light or * triangle 
to emit from, proportional to area. a good improvement would be to @@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) /* Generic Light */ -ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce) +ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg, + int index, + int bounce) { return (bounce > kernel_tex_fetch(__lights, index).max_bounces); } -ccl_device_noinline bool light_sample(KernelGlobals *kg, - int lamp, - float randu, - float randv, - float time, - float3 P, - int bounce, - LightSample *ls) +template<bool in_volume_segment> +ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) { - if (lamp < 0) { - /* sample index */ - int index = light_distribution_sample(kg, &randu); - - /* fetch light data */ - const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch( - __light_distribution, index); - int prim = kdistribution->prim; - - if (prim >= 0) { - int object = kdistribution->mesh_light.object_id; - int shader_flag = kdistribution->mesh_light.shader_flag; - - triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); - ls->shader |= shader_flag; - return (ls->pdf > 0.0f); + /* Sample light index from distribution. */ + const int index = light_distribution_sample(kg, &randu); + const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution, + index); + const int prim = kdistribution->prim; + + if (prim >= 0) { + /* Mesh light. */ + const int object = kdistribution->mesh_light.object_id; + + /* Exclude synthetic meshes from shadow catcher pass. 
*/ + if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) && + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) { + return false; } - lamp = -prim - 1; + const int shader_flag = kdistribution->mesh_light.shader_flag; + triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P); + ls->shader |= shader_flag; + return (ls->pdf > 0.0f); } + const int lamp = -prim - 1; + if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) { return false; } - return lamp_light_sample(kg, lamp, randu, randv, P, ls); + return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls); +} + +ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) +{ + return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls); +} + +ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) +{ + return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls); } -ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index) +ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg, + const float randu, + const float randv, + const float time, + const float3 P, + LightSample *ls) { - return kernel_tex_fetch(__lights, index).samples; + /* Sample a new position on the same light, for volume sampling. 
*/ + if (ls->type == LIGHT_TRIANGLE) { + triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P); + return (ls->pdf > 0.0f); + } + else { + return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h index f0f64ce8704..493ed560bc6 100644 --- a/intern/cycles/kernel/kernel_light_background.h +++ b/intern/cycles/kernel/kernel_light_background.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include "kernel_light_common.h" CCL_NAMESPACE_BEGIN @@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN #ifdef __BACKGROUND_MIS__ -ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) +ccl_device float3 background_map_sample(const KernelGlobals *kg, + float randu, + float randv, + float *pdf) { /* for the following, the CDF values are actually a pair of floats, with the * function value as X and the actual CDF as Y. The last entry's function @@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra /* TODO(sergey): Same as above, after the release we should consider using * 'noinline' for all devices. 
*/ -ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) +ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction) { float2 uv = direction_to_equirectangular(direction); int res_x = kernel_data.background.map_res_x; @@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) } ccl_device_inline bool background_portal_data_fetch_and_check_side( - KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) + const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) { int portal = kernel_data.background.portal_offset + index; const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); @@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side( } ccl_device_inline float background_portal_pdf( - KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) + const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) { float portal_pdf = 0.0f; @@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf( return (num_possible > 0) ? 
portal_pdf / num_possible : 0.0f; } -ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) +ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P) { int num_possible_portals = 0; for (int p = 0; p < kernel_data.background.num_portals; p++) { @@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) return num_possible_portals; } -ccl_device float3 background_portal_sample(KernelGlobals *kg, +ccl_device float3 background_portal_sample(const KernelGlobals *kg, float3 P, float randu, float randv, @@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg, return zero_float3(); } -ccl_device_inline float3 background_sun_sample(KernelGlobals *kg, +ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg, float randu, float randv, float *pdf) @@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg, return D; } -ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D) +ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D) { const float3 N = float4_to_float3(kernel_data.background.sun); const float angle = kernel_data.background.sun.w; @@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D) } ccl_device_inline float3 -background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) { float portal_method_pdf = kernel_data.background.portal_weight; float sun_method_pdf = kernel_data.background.sun_weight; @@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f return D; } -ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction) +ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction) { float portal_method_pdf = 
kernel_data.background.portal_weight; float sun_method_pdf = kernel_data.background.sun_weight; diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h index 4a683d36226..765d8f5338e 100644 --- a/intern/cycles/kernel/kernel_light_common.h +++ b/intern/cycles/kernel/kernel_light_common.h @@ -14,6 +14,10 @@ * limitations under the License. */ +#pragma once + +#include "kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Area light sampling */ @@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P, return true; } -ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t) +ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t) { float cos_pi = dot(Ng, I); diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h new file mode 100644 index 00000000000..33d9d5ae1f0 --- /dev/null +++ b/intern/cycles/kernel/kernel_lookup_table.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Interpolated lookup table access */ + +ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size) +{ + x = saturate(x) * (size - 1); + + int index = min(float_to_int(x), size - 1); + int nindex = min(index + 1, size - 1); + float t = x - index; + + float data0 = kernel_tex_fetch(__lookup_table, index + offset); + if (t == 0.0f) + return data0; + + float data1 = kernel_tex_fetch(__lookup_table, nindex + offset); + return (1.0f - t) * data0 + t * data1; +} + +ccl_device float lookup_table_read_2D( + const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize) +{ + y = saturate(y) * (ysize - 1); + + int index = min(float_to_int(y), ysize - 1); + int nindex = min(index + 1, ysize - 1); + float t = y - index; + + float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize); + if (t == 0.0f) + return data0; + + float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize); + return (1.0f - t) * data0 + t * data1; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 96391db7649..3c5ab95bbc8 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_MATH_H__ -#define __KERNEL_MATH_H__ +#pragma once #include "util/util_color.h" #include "util/util_math.h" @@ -24,5 +23,3 @@ #include "util/util_projection.h" #include "util/util_texture.h" #include "util/util_transform.h" - -#endif /* __KERNEL_MATH_H__ */ diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index ce37bd0b15e..b158f4c4fd3 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __KERNEL_MONTECARLO_CL__ -#define __KERNEL_MONTECARLO_CL__ +#pragma once CCL_NAMESPACE_BEGIN @@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N) } CCL_NAMESPACE_END - -#endif /* __KERNEL_MONTECARLO_CL__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 8f58b8c3079..67466b28170 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -14,61 +14,52 @@ * limitations under the License. */ +#pragma once + +#include "kernel/geom/geom.h" + #include "kernel/kernel_id_passes.h" +#include "kernel/kernel_write_passes.h" CCL_NAMESPACE_BEGIN -#ifdef __DENOISING_FEATURES__ - -ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, - ccl_global float *buffer, - int sample, - float path_total, - float path_total_shaded) +/* Get pointer to pixel in render buffer. */ +ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer( + INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer) { - if (kernel_data.film.pass_denoising_data == 0) - return; - - buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ? 
- DENOISING_PASS_SHADOW_B : - DENOISING_PASS_SHADOW_A; - - path_total = ensure_finite(path_total); - path_total_shaded = ensure_finite(path_total_shaded); - - kernel_write_pass_float(buffer, path_total); - kernel_write_pass_float(buffer + 1, path_total_shaded); - - float value = path_total_shaded / max(path_total, 1e-7f); - kernel_write_pass_float(buffer + 2, value * value); + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + return render_buffer + render_buffer_offset; } -ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - PathRadiance *L) +#ifdef __DENOISING_FEATURES__ + +ccl_device_forceinline void kernel_write_denoising_features_surface( + INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer) { - if (state->denoising_feature_weight == 0.0f) { + if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) { return; } - L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); - /* Skip implicitly transparent surfaces. */ if (sd->flag & SD_HAS_ONLY_VOLUME) { return; } + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + float3 normal = zero_float3(); float3 diffuse_albedo = zero_float3(); float3 specular_albedo = zero_float3(); float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; - if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { continue; + } /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. 
*/ normal += sc->N * sc->sample_weight; @@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, normal /= sum_weight; } - /* Transform normal into camera space. */ - const Transform worldtocamera = kernel_data.cam.worldtocamera; - normal = transform_direction(&worldtocamera, normal); + if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) { + /* Transform normal into camera space. */ + const Transform worldtocamera = kernel_data.cam.worldtocamera; + normal = transform_direction(&worldtocamera, normal); + + const float3 denoising_normal = ensure_finite3(normal); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal); + } - L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); - L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * - state->denoising_feature_throughput * diffuse_albedo); + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, + denoising_feature_throughput); + const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * + diffuse_albedo); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); + } - state->denoising_feature_weight = 0.0f; + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES; } else { - state->denoising_feature_throughput *= specular_albedo; + INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo; + } +} + +ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS, + const float3 albedo, + const bool scatter, + ccl_global float *ccl_restrict + render_buffer) +{ + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput); + + if (scatter && 
kernel_data.film.pass_denoising_normal != PASS_UNUSED) { + /* Assume scatter is sufficiently diffuse to stop writing denoising features. */ + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES; + + /* Write view direction as normal. */ + const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal); + } + + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + /* Write albedo. */ + const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); } } #endif /* __DENOISING_FEATURES__ */ -#ifdef __KERNEL_CPU__ -# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \ - kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name) -ccl_device_inline size_t kernel_write_id_pass_cpu( - float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map) +#ifdef __SHADOW_CATCHER__ + +/* Write shadow catcher passes on a bounce from the shadow catcher object. 
*/ +ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data( + INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer) { - if (map) { - (*map)[id] += matte_weight; - return 0; + if (!kernel_data.integrator.has_shadow_catcher) { + return; + } + + kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); + + if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) { + return; } -#else /* __KERNEL_CPU__ */ -# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \ - kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) -ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, - size_t depth, - float id, - float matte_weight) + + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + + /* Count sample for the shadow catcher object. */ + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f); + + /* Since the split is done, the sample does not contribute to the matte, so accumulate it as + * transparency to the matte. 
*/ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, + average(throughput)); +} + +#endif /* __SHADOW_CATCHER__ */ + +ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer, + size_t depth, + float id, + float matte_weight) { -#endif /* __KERNEL_CPU__ */ - kernel_write_id_slots(buffer, depth, id, matte_weight); - return depth * 2; + kernel_write_id_slots(buffer, depth * 2, id, matte_weight); + return depth * 4; } -ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, - ccl_global float *buffer, - PathRadiance *L, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 throughput) +ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS, + const ShaderData *sd, + ccl_global float *ccl_restrict render_buffer) { #ifdef __PASSES__ - int path_flag = state->flag; + const int path_flag = INTEGRATOR_STATE(path, flag); - if (!(path_flag & PATH_RAY_CAMERA)) + if (!(path_flag & PATH_RAY_CAMERA)) { return; + } - int flag = kernel_data.film.pass_flag; - int light_flag = kernel_data.film.light_pass_flag; + const int flag = kernel_data.film.pass_flag; - if (!((flag | light_flag) & PASS_ANY)) + if (!(flag & PASS_ANY)) { return; + } + + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { - if (state->sample == 0) { + if (INTEGRATOR_STATE(path, sample) == 0) { if (flag & PASSMASK(DEPTH)) { - float depth = camera_z_depth(kg, sd->P); + const float depth = camera_z_depth(kg, sd->P); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth); } if (flag & PASSMASK(OBJECT_ID)) { - float id = object_pass_id(kg, sd->object); + const float id = object_pass_id(kg, sd->object); 
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id); } if (flag & PASSMASK(MATERIAL_ID)) { - float id = shader_pass_id(kg, sd); + const float id = shader_pass_id(kg, sd); kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id); } } + if (flag & PASSMASK(POSITION)) { + const float3 position = sd->P; + kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position); + } if (flag & PASSMASK(NORMAL)) { - float3 normal = shader_bsdf_average_normal(kg, sd); + const float3 normal = shader_bsdf_average_normal(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal); } + if (flag & PASSMASK(ROUGHNESS)) { + const float roughness = shader_bsdf_average_roughness(sd); + kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness); + } if (flag & PASSMASK(UV)) { - float3 uv = primitive_uv(kg, sd); + const float3 uv = primitive_uv(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv); } if (flag & PASSMASK(MOTION)) { - float4 speed = primitive_motion_vector(kg, sd); + const float4 speed = primitive_motion_vector(kg, sd); kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed); kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f); } - state->flag |= PATH_RAY_SINGLE_PASS_DONE; + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE; } } if (kernel_data.film.cryptomatte_passes) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd))); if (matte_weight > 0.0f) { ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - float id = object_cryptomatte_id(kg, sd->object); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object); + const float id = object_cryptomatte_id(kg, 
sd->object); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - float id = shader_cryptomatte_id(kg, sd->shader); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material); + const float id = shader_cryptomatte_id(kg, sd->shader); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - float id = object_cryptomatte_asset_id(kg, sd->object); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset); + const float id = object_cryptomatte_asset_id(kg, sd->object); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } } } - if (light_flag & PASSMASK_COMPONENT(DIFFUSE)) - L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput; - if (light_flag & PASSMASK_COMPONENT(GLOSSY)) - L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput; - if (light_flag & PASSMASK_COMPONENT(TRANSMISSION)) - L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput; - - if (light_flag & PASSMASK(MIST)) { - /* bring depth into 0..1 range */ - float mist_start = kernel_data.film.mist_start; - float mist_inv_depth = kernel_data.film.mist_inv_depth; + if (flag & PASSMASK(DIFFUSE_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, + shader_bsdf_diffuse(kg, sd) * throughput); + } + if (flag & PASSMASK(GLOSSY_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, + shader_bsdf_glossy(kg, sd) * throughput); + } + if (flag & 
PASSMASK(TRANSMISSION_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, + shader_bsdf_transmission(kg, sd) * throughput); + } + if (flag & PASSMASK(MIST)) { + /* Bring depth into 0..1 range. */ + const float mist_start = kernel_data.film.mist_start; + const float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, sd->P); + const float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start) * mist_inv_depth); - /* falloff */ - float mist_falloff = kernel_data.film.mist_falloff; + /* Falloff */ + const float mist_falloff = kernel_data.film.mist_falloff; if (mist_falloff == 1.0f) ; @@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, else mist = powf(mist, mist_falloff); - /* modulate by transparency */ - float3 alpha = shader_bsdf_alpha(kg, sd); - L->mist += (1.0f - mist) * average(throughput * alpha); - } -#endif -} + /* Modulate by transparency */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float3 alpha = shader_bsdf_alpha(kg, sd); + const float mist_output = (1.0f - mist) * average(throughput * alpha); -ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, - ccl_global float *buffer, - PathRadiance *L) -{ -#ifdef __PASSES__ - int light_flag = kernel_data.film.light_pass_flag; - - if (!kernel_data.film.use_light_pass) - return; - - if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse); - if (light_flag & PASSMASK(GLOSSY_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy); - if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, - L->indirect_transmission); - if (light_flag & PASSMASK(VOLUME_INDIRECT)) - 
kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume); - if (light_flag & PASSMASK(DIFFUSE_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse); - if (light_flag & PASSMASK(GLOSSY_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy); - if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, - L->direct_transmission); - if (light_flag & PASSMASK(VOLUME_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume); - - if (light_flag & PASSMASK(EMISSION)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission); - if (light_flag & PASSMASK(BACKGROUND)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background); - if (light_flag & PASSMASK(AO)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao); - - if (light_flag & PASSMASK(DIFFUSE_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse); - if (light_flag & PASSMASK(GLOSSY_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy); - if (light_flag & PASSMASK(TRANSMISSION_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, - L->color_transmission); - if (light_flag & PASSMASK(SHADOW)) { - float3 shadow = L->shadow; - kernel_write_pass_float4( - buffer + kernel_data.film.pass_shadow, - make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale)); + /* Note that the final value in the render buffer we want is 1 - mist_output, + * to avoid having to tracking this in the Integrator state we do the negation + * after rendering. 
*/ + kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output); } - if (light_flag & PASSMASK(MIST)) - kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist); #endif } -ccl_device_inline void kernel_write_result(KernelGlobals *kg, - ccl_global float *buffer, - int sample, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_WRITE_RESULT); - PROFILING_OBJECT(PRIM_NONE); - - float alpha; - float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); - - if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) { - kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); - } - - kernel_write_light_passes(kg, buffer, L); - -#ifdef __DENOISING_FEATURES__ - if (kernel_data.film.pass_denoising_data) { -# ifdef __SHADOW_TRICKS__ - kernel_write_denoising_shadow(kg, - buffer + kernel_data.film.pass_denoising_data, - sample, - average(L->path_total), - average(L->path_total_shaded)); -# else - kernel_write_denoising_shadow( - kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); -# endif - if (kernel_data.film.pass_denoising_clean) { - float3 noisy, clean; - path_radiance_split_denoising(kg, L, &noisy, &clean); - kernel_write_pass_float3_variance( - buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy); - kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean); - } - else { - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_COLOR, - ensure_finite3(L_sum)); - } - - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_NORMAL, - L->denoising_normal); - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_ALBEDO, - L->denoising_albedo); - kernel_write_pass_float_variance( - buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth); - } -#endif /* __DENOISING_FEATURES__ */ - - /* 
Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping - criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte - Carlo global illumination" except that here it is applied per pixel and not in hierarchical - tiles. */ - if (kernel_data.film.pass_adaptive_aux_buffer && - kernel_data.integrator.adaptive_threshold > 0.0f) { - if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { - kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer, - make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); - } -#ifdef __KERNEL_CPU__ - if ((sample > kernel_data.integrator.adaptive_min_samples) && - kernel_data.integrator.adaptive_stop_per_sample) { - const int step = kernel_data.integrator.adaptive_step; - - if ((sample & (step - 1)) == (step - 1)) { - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } -#endif - } - - /* Write the sample count as negative numbers initially to mark the samples as in progress. - * Once the tile has finished rendering, the sign gets flipped and all the pixel values - * are scaled as if they were taken at a uniform sample count. */ - if (kernel_data.film.pass_sample_count) { - /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between - * passes. 
*/ -#ifdef __ATOMIC_PASS_WRITE__ - atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count), - 0x80000000); -#else - if (buffer[kernel_data.film.pass_sample_count] > 0) { - buffer[kernel_data.film.pass_sample_count] *= -1.0f; - } -#endif - kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f); - } -} - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h deleted file mode 100644 index 92a097de9e1..00000000000 --- a/intern/cycles/kernel/kernel_path.h +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef __OSL__ -# include "kernel/osl/osl_shader.h" -#endif - -// clang-format off -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_write_passes.h" -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_adaptive_sampling.h" -#include "kernel/kernel_passes.h" - -#if defined(__VOLUME__) || defined(__SUBSURFACE__) -# include "kernel/kernel_volume.h" -#endif - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" -#endif - -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" -// clang-format on - -CCL_NAMESPACE_BEGIN - -ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - Intersection *isect, - PathRadiance *L, - const int last_object) -{ - PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT); - - uint visibility = path_state_ray_visibility(kg, state); - - if (path_state_ao_bounce(kg, state)) { - ray->t = kernel_data.background.ao_distance; - if (last_object != OBJECT_NONE) { - const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance; - if (object_ao_distance != 0.0f) { - ray->t = object_ao_distance; - } - } - } - - bool hit = scene_intersect(kg, ray, visibility, isect); - - return hit; -} - -ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - float3 throughput, - ccl_addr_space Intersection *isect, - ShaderData *emission_sd, - PathRadiance *L) -{ - PROFILING_INIT(kg, 
PROFILING_INDIRECT_EMISSION); - -#ifdef __LAMP_MIS__ - if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray ccl_optional_struct_init; - - light_ray.P = ray->P - state->ray_t * ray->D; - state->ray_t += isect->t; - light_ray.D = ray->D; - light_ray.t = state->ray_t; - light_ray.time = ray->time; - light_ray.dD = ray->dD; - light_ray.dP = ray->dP; - - /* intersect with lamp */ - indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput); - } -#endif /* __LAMP_MIS__ */ -} - -ccl_device_forceinline void kernel_path_background(KernelGlobals *kg, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - float3 throughput, - ShaderData *sd, - ccl_global float *buffer, - PathRadiance *L) -{ - /* eval background shader if nothing hit */ - if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { - L->transparent += average(throughput); - -#ifdef __PASSES__ - if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND))) -#endif /* __PASSES__ */ - return; - } - - /* When using the ao bounces approximation, adjust background - * shader intensity with ao factor. */ - if (path_state_ao_bounce(kg, state)) { - throughput *= kernel_data.background.ao_bounces_factor; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, sd, state, buffer, ray); - path_radiance_accum_background(kg, L, state, throughput, L_background); -#endif /* __BACKGROUND__ */ -} - -#ifndef __SPLIT_KERNEL__ - -# ifdef __VOLUME__ -ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - Ray *ray, - float3 *throughput, - ccl_addr_space Intersection *isect, - bool hit, - ShaderData *emission_sd, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_VOLUME); - - /* Sanitize volume stack. 
*/ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - - if (state->volume_stack[0].shader == SHADER_NONE) { - return VOLUME_PATH_ATTENUATED; - } - - /* volume attenuation, emission, scatter */ - Ray volume_ray = *ray; - volume_ray.t = (hit) ? isect->t : FLT_MAX; - - float step_size = volume_stack_step_size(kg, state->volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); - bool direct = (state->flag & PATH_RAY_CAMERA) != 0; - bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method); - - if (decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, sd, &volume_ray); - kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if (volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission); - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if (volume_segment.closure_flag & SD_SCATTER) { - int all = kernel_data.integrator.sample_all_lights_indirect; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light( - kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter( - kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if (result == VOLUME_PATH_SCATTERED) { - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) - return VOLUME_PATH_SCATTERED; - else - return VOLUME_PATH_MISSED; - } - else { - *throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); - - /* indirect light bounce */ - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) - return VOLUME_PATH_SCATTERED; - else - return VOLUME_PATH_MISSED; - } -# endif /* __VOLUME_SCATTER__ */ - } - - return VOLUME_PATH_ATTENUATED; -} -# endif /* __VOLUME__ */ - -#endif /* __SPLIT_KERNEL__ */ - -ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - float3 throughput, - ShaderData *emission_sd, - PathRadiance *L, - ccl_global float *buffer) -{ - PROFILING_INIT(kg, PROFILING_SHADER_APPLY); - -#ifdef __SHADOW_TRICKS__ - if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) { - if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO); 
- - float3 bg = zero_float3(); - if (!kernel_data.background.transparent) { - bg = indirect_background(kg, emission_sd, state, NULL, ray); - } - path_radiance_accum_shadowcatcher(L, throughput, bg); - } - } - else if (state->flag & PATH_RAY_SHADOW_CATCHER) { - /* Only update transparency after shadow catcher bounce. */ - L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd)); - } -#endif /* __SHADOW_TRICKS__ */ - - /* holdout */ -#ifdef __HOLDOUT__ - if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { - const float3 holdout_weight = shader_holdout_apply(kg, sd); - if (kernel_data.background.transparent) { - L->transparent += average(holdout_weight * throughput); - } - if (isequal_float3(holdout_weight, one_float3())) { - return false; - } - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, L, sd, state, throughput); - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if (kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf; - - if (blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if (sd->flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission( - kg, sd, sd->ray_length, state->flag, state->ray_pdf); - path_radiance_accum_emission(kg, L, state, throughput, emission); - } -#endif /* __EMISSION__ */ - - return true; -} - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline /* inline trace calls */ -#else -ccl_device_noinline -#endif - void - kernel_path_ao(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 ao_alpha) -{ - 
PROFILING_INIT(kg, PROFILING_AO); - - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd->P, sd->Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; - light_ray.time = sd->time; - light_ray.dP = sd->dP; - light_ray.dD = differential3_zero(); - - if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); - } - else { - path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); - } - } -} - -#ifndef __SPLIT_KERNEL__ - -# if defined(__BRANCHED_PATH__) || defined(__BAKING__) - -ccl_device void kernel_path_indirect(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - Ray *ray, - float3 throughput, - PathState *state, - PathRadiance *L, - const int last_object) -{ -# ifdef __SUBSURFACE__ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - - for (;;) { -# endif /* __SUBSURFACE__ */ - - /* path iteration */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object); - - /* Find intersection with lamps and compute emission for MIS. */ - kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L); - -# ifdef __VOLUME__ - /* Volume integration. 
*/ - VolumeIntegrateResult result = kernel_path_volume( - kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L); - - if (result == VOLUME_PATH_SCATTERED) { - continue; - } - else if (result == VOLUME_PATH_MISSED) { - break; - } -# endif /* __VOLUME__*/ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, state, ray, throughput, sd, NULL, L); - break; - } - else if (path_state_ao_bounce(kg, state)) { - if (intersection_get_shader_flags(kg, &isect) & - (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; - } - else { - break; - } - } - - /* Setup shader data. */ - shader_setup_from_ray(kg, sd, &isect, ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd->flag & SD_HAS_ONLY_VOLUME)) { -# endif - - /* Evaluate shader. */ - shader_eval_surface(kg, sd, state, NULL, state->flag); - shader_prepare_closures(sd, state); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) { - break; - } - - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. 
gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - - if (terminate >= probability) - break; - - throughput /= probability; - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, sd, state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3()); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object, replacing - * the closures with a diffuse BSDF */ - if (sd->flag & SD_BSSRDF) { - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) { - break; - } - } -# endif /* __SUBSURFACE__ */ - -# if defined(__EMISSION__) - int all = (kernel_data.integrator.sample_all_lights_indirect) || - (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, state, throughput, 1.0f, L, all); -# endif /* defined(__EMISSION__) */ - -# ifdef __VOLUME__ - } -# endif - - if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray)) - break; - } - -# ifdef __SUBSURFACE__ - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. 
- */ - if (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput); - } - else { - break; - } - } -# endif /* __SUBSURFACE__ */ -} - -# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ - -ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg, - PathState *state, - float3 throughput, - Ray *ray, - PathRadiance *L, - ccl_global float *buffer, - ShaderData *emission_sd) -{ - PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE); - - /* Shader data memory used for both volumes and surfaces, saves stack space. */ - ShaderData sd; - -# ifdef __SUBSURFACE__ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - - for (;;) { -# endif /* __SUBSURFACE__ */ - - /* path iteration */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object); - - /* Find intersection with lamps and compute emission for MIS. */ - kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L); - -# ifdef __VOLUME__ - /* Volume integration. */ - VolumeIntegrateResult result = kernel_path_volume( - kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L); - - if (result == VOLUME_PATH_SCATTERED) { - continue; - } - else if (result == VOLUME_PATH_MISSED) { - break; - } -# endif /* __VOLUME__*/ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, state, ray, throughput, &sd, buffer, L); - break; - } - else if (path_state_ao_bounce(kg, state)) { - if (intersection_get_shader_flags(kg, &isect) & - (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; - } - else { - break; - } - } - - /* Setup shader data. */ - shader_setup_from_ray(kg, &sd, &isect, ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { -# endif - - /* Evaluate shader. 
*/ - shader_eval_surface(kg, &sd, state, buffer, state->flag); - shader_prepare_closures(&sd, state); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) { - break; - } - - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - if (terminate >= probability) - break; - - throughput /= probability; - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, &sd, state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object, replacing - * the closures with a diffuse BSDF */ - if (sd.flag & SD_BSSRDF) { - if (kernel_path_subsurface_scatter( - kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) { - break; - } - } -# endif /* __SUBSURFACE__ */ - -# ifdef __EMISSION__ - /* direct lighting */ - kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); -# endif /* __EMISSION__ */ - -# ifdef __VOLUME__ - } -# endif - - /* compute direct lighting and next bounce */ - if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray)) - break; - } - -# ifdef __SUBSURFACE__ - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. 
- */ - if (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput); - } - else { - break; - } - } -# endif /* __SUBSURFACE__ */ -} - -ccl_device void kernel_path_trace( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - PROFILING_INIT(kg, PROFILING_RAY_SETUP); - - /* buffer offset */ - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - - buffer += index * pass_stride; - - if (kernel_data.film.pass_adaptive_aux_buffer) { - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w > 0.0f) { - return; - } - } - - /* Initialize random numbers and sample ray. */ - uint rng_hash; - Ray ray; - - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - - if (ray.t == 0.0f) { - return; - } - - /* Initialize state. */ - float3 throughput = one_float3(); - - PathRadiance L; - path_radiance_init(kg, &L); - - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); - -# ifdef __KERNEL_OPTIX__ - /* Force struct into local memory to avoid costly spilling on trace calls. */ - if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */ - for (int i = 0; i < sizeof(L); ++i) - reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0; -# endif - - /* Integrate. 
*/ - kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd); - - kernel_write_result(kg, buffer, sample, &L); -} - -#endif /* __SPLIT_KERNEL__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h deleted file mode 100644 index a1ee1bc107e..00000000000 --- a/intern/cycles/kernel/kernel_path_branched.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __BRANCHED_PATH__ - -ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput) -{ - int num_samples = kernel_data.integrator.ao_samples; - float num_samples_inv = 1.0f / num_samples; - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - float3 ao_alpha = shader_bsdf_alpha(kg, sd); - - for (int j = 0; j < num_samples; j++) { - float bsdf_u, bsdf_v; - path_branched_rng_2D( - kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd->P, sd->Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; - light_ray.time = sd->time; - light_ray.dP = sd->dP; - light_ray.dD = differential3_zero(); - - if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao( - kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); - } - else { - path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf); - } - } - } -} - -# ifndef __SPLIT_KERNEL__ - -# ifdef __VOLUME__ -ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - Ray *ray, - float3 *throughput, - ccl_addr_space Intersection *isect, - bool hit, - ShaderData *indirect_sd, - ShaderData *emission_sd, - PathRadiance *L) -{ - /* Sanitize volume stack. */ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - - if (state->volume_stack[0].shader == SHADER_NONE) { - return; - } - - /* volume attenuation, emission, scatter */ - Ray volume_ray = *ray; - volume_ray.t = (hit) ? 
isect->t : FLT_MAX; - - float step_size = volume_stack_step_size(kg, state->volume_stack); - const int object = sd->object; - -# ifdef __VOLUME_DECOUPLED__ - /* decoupled ray marching only supported on CPU */ - if (kernel_data.integrator.volume_decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, sd, &volume_ray); - kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size); - - /* direct light sampling */ - if (volume_segment.closure_flag & SD_SCATTER) { - volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack); - - int all = kernel_data.integrator.sample_all_lights_direct; - - kernel_branched_path_volume_connect_light( - kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment); - - /* indirect light sampling */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - Ray pray = *ray; - float3 tp = *throughput; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - /* scatter sample. 
if we use distance sampling and take just one - * sample for direct and indirect light, we could share this - * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); - float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter( - kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - - if (result == VOLUME_PATH_SCATTERED && - kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) { - kernel_path_indirect( - kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } - } - - /* emission and transmittance */ - if (volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission); - *throughput *= volume_segment.accum_transmittance; - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* GPU: no decoupled ray marching, scatter probabilistically. */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - /* todo: we should cache the shader evaluations from stepping - * through the volume, for now we redo them multiple times */ - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - Ray pray = *ray; - float3 tp = (*throughput) * num_samples_inv; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, sd, &volume_ray, L, &tp, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* todo: support equiangular, MIS and all light sampling. 
- * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L); - - if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) { - kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } -# endif /* __VOLUME_SCATTER__ */ - } - - /* todo: avoid this calculation using decoupled ray marching */ - kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput); - } -} -# endif /* __VOLUME__ */ - -/* bounce off surface and integrate indirect light */ -ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *indirect_sd, - ShaderData *emission_sd, - float3 throughput, - float num_samples_adjust, - PathState *state, - PathRadiance *L) -{ - float sum_sample_weight = 0.0f; -# ifdef __DENOISING_FEATURES__ - if (state->denoising_feature_weight > 0.0f) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - sum_sample_weight += sc->sample_weight; - } - } - else { - sum_sample_weight = 1.0f; - } -# endif /* __DENOISING_FEATURES__ */ - - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - int num_samples; - - if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) - num_samples = kernel_data.integrator.diffuse_samples; - else if (CLOSURE_IS_BSDF_BSSRDF(sc->type)) - num_samples = 1; - else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) - num_samples = 
kernel_data.integrator.glossy_samples; - else - num_samples = kernel_data.integrator.transmission_samples; - - num_samples = ceil_to_int(num_samples_adjust * num_samples); - - float num_samples_inv = num_samples_adjust / num_samples; - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - float3 tp = throughput; - Ray bsdf_ray; -# ifdef __SHADOW_TRICKS__ - float shadow_transparency = L->shadow_transparency; -# endif - - ps.rng_hash = cmj_hash(state->rng_hash, i); - - if (!kernel_branched_path_surface_bounce( - kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) { - continue; - } - - ps.rng_hash = state->rng_hash; - - kernel_path_indirect( - kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - -# ifdef __SHADOW_TRICKS__ - L->shadow_transparency = shadow_transparency; -# endif - } - } -} - -# ifdef __SUBSURFACE__ -ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, - ShaderData *sd, - ShaderData *indirect_sd, - ShaderData *emission_sd, - PathRadiance *L, - PathState *state, - Ray *ray, - float3 throughput) -{ - for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* set up random number generator */ - uint lcg_state = lcg_state_init(state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples * 3; - float num_samples_inv = 1.0f / num_samples; - uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for (int j = 0; j < num_samples; j++) { - PathState hit_state = *state; - path_state_branch(&hit_state, j, num_samples); - hit_state.rng_hash = bssrdf_rng_hash; - - LocalIntersection 
ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - hit_state.rng_offset += PRNG_BOUNCE_NUM; - -# ifdef __VOLUME__ - Ray volume_ray = *ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* compute lighting with the BSDF closure */ - for (int hit = 0; hit < num_hits; hit++) { - ShaderData bssrdf_sd = *sd; - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - subsurface_scatter_multi_setup( - kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness); - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - /* Setup ray from previous surface point to the new one. */ - float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng); - volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); - - for (int k = 0; k < VOLUME_STACK_SIZE; k++) { - hit_state.volume_stack[k] = state->volume_stack[k]; - } - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state.volume_stack); - } -# endif /* __VOLUME__ */ - -# ifdef __EMISSION__ - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (hit_state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all); - } -# endif /* __EMISSION__ */ - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L); - } - } - } -} -# endif /* __SUBSURFACE__ */ - -ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, - uint rng_hash, - int sample, - Ray ray, - 
ccl_global float *buffer, - PathRadiance *L) -{ - /* initialize */ - float3 throughput = one_float3(); - - path_radiance_init(kg, L); - - /* shader data memory used for both volumes and surfaces, saves stack space */ - ShaderData sd; - /* shader data used by emission, shadows, volume stacks, indirect path */ - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - ShaderData indirect_sd; - - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); - - /* Main Loop - * Here we only handle transparency intersections from the camera ray. - * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). - */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object); - -# ifdef __VOLUME__ - /* Volume integration. */ - kernel_branched_path_volume( - kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L); -# endif /* __VOLUME__ */ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L); - break; - } - - /* Setup and evaluate shader. */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { -# endif - - shader_eval_surface(kg, &sd, &state, buffer, state.flag); - shader_merge_closures(&sd); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) { - break; - } - - /* transparency termination */ - if (state.flag & PATH_RAY_TRANSPARENT) { - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. 
gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, &state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); - - if (terminate >= probability) - break; - - throughput /= probability; - } - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, &sd, &state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object */ - if (sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter( - kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput); - } -# endif /* __SUBSURFACE__ */ - - PathState hit_state = state; - -# ifdef __EMISSION__ - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all); - } -# endif /* __EMISSION__ */ - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L); - - /* continue in case of transparency */ - throughput *= shader_bsdf_transparency(kg, &sd); - - if (is_zero(throughput)) - break; - - /* Update Path State */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - -# ifdef __VOLUME__ - } - else { - if (!path_state_volume_next(kg, &state)) { - break; - } - } -# endif - - ray.P = ray_offset(sd.P, -sd.Ng); - ray.t -= sd.ray_length; /* clipping works through transparent */ - -# ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD.dx = -sd.dI.dx; - ray.dD.dy = 
-sd.dI.dy; -# endif /* __RAY_DIFFERENTIALS__ */ - -# ifdef __VOLUME__ - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -# endif /* __VOLUME__ */ - } -} - -ccl_device void kernel_branched_path_trace( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - /* buffer offset */ - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - - buffer += index * pass_stride; - - if (kernel_data.film.pass_adaptive_aux_buffer) { - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w > 0.0f) { - return; - } - } - - /* initialize random numbers and ray */ - uint rng_hash; - Ray ray; - - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - - /* integrate */ - PathRadiance L; - - if (ray.t != 0.0f) { - kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L); - kernel_write_result(kg, buffer, sample, &L); - } -} - -# endif /* __SPLIT_KERNEL__ */ - -#endif /* __BRANCHED_PATH__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h deleted file mode 100644 index 815767595a9..00000000000 --- a/intern/cycles/kernel/kernel_path_common.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "util/util_hash.h" - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_path_trace_setup( - KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray) -{ - float filter_u; - float filter_v; - - int num_samples = kernel_data.integrator.aa_samples; - - path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); - - /* sample camera ray */ - - float lens_u = 0.0f, lens_v = 0.0f; - - if (kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); - - float time = 0.0f; - -#ifdef __CAMERA_MOTION__ - if (kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); -#endif - - camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index bf601580cd0..ebb2c0df4f1 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -14,99 +14,116 @@ * limitations under the License. */ -CCL_NAMESPACE_BEGIN +#pragma once -ccl_device_inline void path_state_init(KernelGlobals *kg, - ShaderData *stack_sd, - ccl_addr_space PathState *state, - uint rng_hash, - int sample, - ccl_addr_space Ray *ray) -{ - state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND; +#include "kernel_random.h" - state->rng_hash = rng_hash; - state->rng_offset = PRNG_BASE_NUM; - state->sample = sample; - state->num_samples = kernel_data.integrator.aa_samples; - state->branch_factor = 1.0f; +CCL_NAMESPACE_BEGIN - state->bounce = 0; - state->diffuse_bounce = 0; - state->glossy_bounce = 0; - state->transmission_bounce = 0; - state->transparent_bounce = 0; +/* Initialize queues, so that the this path is considered terminated. 
+ * Used for early outputs in the camera ray initialization, as well as initialization of split + * states for shadow catcher. */ +ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS) +{ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; +} -#ifdef __DENOISING_FEATURES__ - if (kernel_data.film.pass_denoising_data) { - state->flag |= PATH_RAY_STORE_SHADOW_INFO; - state->denoising_feature_weight = 1.0f; - state->denoising_feature_throughput = one_float3(); - } - else { - state->denoising_feature_weight = 0.0f; - state->denoising_feature_throughput = zero_float3(); - } -#endif /* __DENOISING_FEATURES__ */ +/* Minimalistic initialization of the path state, which is needed for early outputs in the + * integrator initialization to work. */ +ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + const int x, + const int y) +{ + const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride; - state->min_ray_pdf = FLT_MAX; - state->ray_pdf = 0.0f; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif + INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index; -#ifdef __VOLUME__ - state->volume_bounce = 0; - state->volume_bounds_bounce = 0; + path_state_init_queues(INTEGRATOR_STATE_PASS); +} - if (kernel_data.integrator.use_volumes) { - /* Initialize volume stack with volume we are inside of. */ - kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); +/* Initialize the rest of the path state needed to continue the path integration. 
*/ +ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS, + const int sample, + const uint rng_hash) +{ + INTEGRATOR_STATE_WRITE(path, sample) = sample; + INTEGRATOR_STATE_WRITE(path, bounce) = 0; + INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash; + INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM; + INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | + PATH_RAY_TRANSPARENT_BACKGROUND; + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX; + INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f); + + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE; } - else { - state->volume_stack[0].shader = SHADER_NONE; + +#ifdef __DENOISING_FEATURES__ + if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES; + INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3(); } #endif } -ccl_device_inline void path_state_next(KernelGlobals *kg, - ccl_addr_space PathState *state, - int label) +ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label) { + uint32_t flag = INTEGRATOR_STATE(path, flag); + /* ray through transparent keeps same flags from previous ray and is * not counted 
as a regular bounce, transparent has separate max */ if (label & LABEL_TRANSPARENT) { - state->flag |= PATH_RAY_TRANSPARENT; - state->transparent_bounce++; - if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { - state->flag |= PATH_RAY_TERMINATE_IMMEDIATE; + uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1; + + flag |= PATH_RAY_TRANSPARENT; + if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { + flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE; } if (!kernel_data.integrator.transparent_shadows) - state->flag |= PATH_RAY_MIS_SKIP; - - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + flag |= PATH_RAY_MIS_SKIP; + INTEGRATOR_STATE_WRITE(path, flag) = flag; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce; + /* Random number generator next bounce. */ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; return; } - state->bounce++; - if (state->bounce >= kernel_data.integrator.max_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1; + if (bounce >= kernel_data.integrator.max_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } - state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP); + flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP); #ifdef __VOLUME__ if (label & LABEL_VOLUME_SCATTER) { /* volume scatter */ - state->flag |= PATH_RAY_VOLUME_SCATTER; - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag |= PATH_RAY_VOLUME_SCATTER; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + if (bounce == 1) { + flag |= PATH_RAY_VOLUME_PASS; + } - state->volume_bounce++; - if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce; + if (volume_bounce >= 
kernel_data.integrator.max_volume_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } else @@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, { /* surface reflection/transmission */ if (label & LABEL_REFLECT) { - state->flag |= PATH_RAY_REFLECT; - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag |= PATH_RAY_REFLECT; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; if (label & LABEL_DIFFUSE) { - state->diffuse_bounce++; - if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce; + if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } else { - state->glossy_bounce++; - if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce; + if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } } else { kernel_assert(label & LABEL_TRANSMIT); - state->flag |= PATH_RAY_TRANSMIT; + flag |= PATH_RAY_TRANSMIT; if (!(label & LABEL_TRANSMIT_TRANSPARENT)) { - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; } - state->transmission_bounce++; - if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce; + if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } /* 
diffuse/glossy/singular */ if (label & LABEL_DIFFUSE) { - state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR; + flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR; } else if (label & LABEL_GLOSSY) { - state->flag |= PATH_RAY_GLOSSY; + flag |= PATH_RAY_GLOSSY; } else { kernel_assert(label & LABEL_SINGULAR); - state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP; + flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP; + } + + /* Render pass categories. */ + if (bounce == 1) { + flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS; } } - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + INTEGRATOR_STATE_WRITE(path, flag) = flag; + INTEGRATOR_STATE_WRITE(path, bounce) = bounce; -#ifdef __DENOISING_FEATURES__ - if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { - state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; - } -#endif + /* Random number generator next bounce. */ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; } #ifdef __VOLUME__ -ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state) +ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS) { /* For volume bounding meshes we pass through without counting transparent * bounces, only sanity check in case self intersection gets us stuck. */ - state->volume_bounds_bounce++; - if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) { + uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce; + if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) { return false; } /* Random number generator next bounce. 
*/ - if (state->volume_bounds_bounce > 1) { - state->rng_offset += PRNG_BOUNCE_NUM; + if (volume_bounds_bounce > 1) { + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; } return true; } #endif -ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, - ccl_addr_space PathState *state) +ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS) { - uint flag = state->flag & PATH_RAY_ALL_VISIBILITY; + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); - /* for visibility, diffuse/glossy are for reflection only */ - if (flag & PATH_RAY_TRANSMIT) - flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY); - /* todo: this is not supported as its own ray visibility yet */ - if (state->flag & PATH_RAY_VOLUME_SCATTER) - flag |= PATH_RAY_DIFFUSE; + uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY; - return flag; + /* For visibility, diffuse/glossy are for reflection only. */ + if (visibility & PATH_RAY_TRANSMIT) { + visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY); + } + + /* todo: this is not supported as its own ray visibility yet. */ + if (path_flag & PATH_RAY_VOLUME_SCATTER) { + visibility |= PATH_RAY_DIFFUSE; + } + + visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility); + + return visibility; } -ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, - ccl_addr_space PathState *state, - const float3 throughput) +ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS, + const uint32_t path_flag) { - if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) { - /* Ray is to be terminated immediately. */ - return 0.0f; - } - else if (state->flag & PATH_RAY_TRANSPARENT) { + if (path_flag & PATH_RAY_TRANSPARENT) { + const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); /* Do at least specified number of bounces without RR. 
*/ - if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) { - return 1.0f; - } -#ifdef __SHADOW_TRICKS__ - /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) { return 1.0f; } -#endif } else { + const uint32_t bounce = INTEGRATOR_STATE(path, bounce); /* Do at least specified number of bounces without RR. */ - if (state->bounce <= kernel_data.integrator.min_bounce) { + if (bounce <= kernel_data.integrator.min_bounce) { return 1.0f; } -#ifdef __SHADOW_TRICKS__ - /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { - return 1.0f; - } -#endif } /* Probabilistic termination: use sqrt() to roughly match typical view * transform and do path termination a bit later on average. */ - return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f); + return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f); } -/* TODO(DingTo): Find more meaningful name for this */ -ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase) +ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS) { - /* Modify bounce temporarily for shader eval */ - if (increase) - state->bounce += 1; - else - state->bounce -= 1; -} - -ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state) -{ - if (state->bounce <= kernel_data.integrator.ao_bounces) { + if (!kernel_data.integrator.ao_bounces) { return false; } - int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0); + const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) - + (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1; return (bounce > kernel_data.integrator.ao_bounces); } 
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, - int branch, - int num_branches) +/* Random Number Sampling Utility Functions + * + * For each random number in each step of the path we must have a unique + * dimension to avoid using the same sequence twice. + * + * For branches in the path we must be careful not to reuse the same number + * in a sequence and offset accordingly. + */ + +/* RNG State loaded onto stack. */ +typedef struct RNGState { + uint rng_hash; + uint rng_offset; + int sample; +} RNGState; + +ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state) +{ + rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash); + rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset); + rng_state->sample = INTEGRATOR_STATE(path, sample); +} + +ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state) +{ + const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) - + INTEGRATOR_STATE(path, transparent_bounce); + + rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash); + rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces; + rng_state->sample = INTEGRATOR_STATE(path, sample); +} + +ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg, + const RNGState *rng_state, + int dimension) +{ + return path_rng_1D( + kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension); +} + +ccl_device_inline void path_state_rng_2D( + const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy) +{ + path_rng_2D( + kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy); +} + +ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg, + const RNGState *rng_state, + uint hash) +{ + /* Use a hash instead of dimension, this is not great but avoids adding + * more dimensions to each bounce which reduces quality of 
dimensions we + * are already using. */ + return path_rng_1D( + kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset); +} + +ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg, + const RNGState *rng_state, + int branch, + int num_branches, + int dimension) +{ + return path_rng_1D(kg, + rng_state->rng_hash, + rng_state->sample * num_branches + branch, + rng_state->rng_offset + dimension); +} + +ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg, + const RNGState *rng_state, + int branch, + int num_branches, + int dimension, + float *fx, + float *fy) +{ + path_rng_2D(kg, + rng_state->rng_hash, + rng_state->sample * num_branches + branch, + rng_state->rng_offset + dimension, + fx, + fy); +} + +/* Utility functions to get light termination value, + * since it might not be needed in many cases. + */ +ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg, + const RNGState *state) { - if (num_branches > 1) { - /* Path is splitting into a branch, adjust so that each branch - * still gets a unique sample from the same sequence. */ - state->sample = state->sample * num_branches + branch; - state->num_samples = state->num_samples * num_branches; - state->branch_factor *= num_branches; + if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } + return 0.0f; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h deleted file mode 100644 index 97d3f292ca3..00000000000 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright 2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif - bool - kernel_path_subsurface_scatter(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - ccl_addr_space float3 *throughput, - ccl_addr_space SubsurfaceIndirectRays *ss_indirect) -{ - PROFILING_INIT(kg, PROFILING_SUBSURFACE); - - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - - const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if (sc) { - /* We should never have two consecutive BSSRDF bounces, - * the second one should be converted to a diffuse BSDF to - * avoid this. - */ - kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR)); - - uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); - - LocalIntersection ss_isect; - int num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false); -# ifdef __VOLUME__ - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* Closure memory will be overwritten, so read required variables now. 
*/ - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - - /* compute lighting with the BSDF closure */ - for (int hit = 0; hit < num_hits; hit++) { - /* NOTE: We reuse the existing ShaderData, we assume the path - * integration loop stops when this function returns true. - */ - subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness); - - kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L); - - ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; - ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; - ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays]; - - *hit_state = *state; - *hit_ray = *ray; - *hit_tp = *throughput; - *hit_L_state = L->state; - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - - if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) { -# ifdef __LAMP_MIS__ - hit_state->ray_t = 0.0f; -# endif /* __LAMP_MIS__ */ - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - Ray volume_ray = *ray; - /* Setup ray from previous surface point to the new one. 
*/ - volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t); - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - ss_indirect->num_rays++; - } - } - return true; - } - return false; -} - -ccl_device_inline void kernel_path_subsurface_init_indirect( - ccl_addr_space SubsurfaceIndirectRays *ss_indirect) -{ - ss_indirect->num_rays = 0; -} - -ccl_device void kernel_path_subsurface_setup_indirect( - KernelGlobals *kg, - ccl_addr_space SubsurfaceIndirectRays *ss_indirect, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - PathRadiance *L, - ccl_addr_space float3 *throughput) -{ - /* Setup state, ray and throughput for indirect SSS rays. */ - ss_indirect->num_rays--; - - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - - *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = ss_indirect->rays[ss_indirect->num_rays]; - L->state = ss_indirect->L_state[ss_indirect->num_rays]; - *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; - - state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; -} - -#endif /* __SUBSURFACE__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h deleted file mode 100644 index ba48c0bdfc4..00000000000 --- a/intern/cycles/kernel/kernel_path_surface.h +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \ - defined(__BAKING__) -/* branched path tracing: connect path directly to position on one or more lights and add it to L - */ -ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light( - KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - float3 throughput, - float num_samples_adjust, - PathRadiance *L, - int sample_all_lights) -{ -# ifdef __EMISSION__ - /* sample illumination from lights to find path contribution */ - BsdfEval L_light ccl_optional_struct_init; - - int num_lights = 0; - if (kernel_data.integrator.use_direct_light) { - if (sample_all_lights) { - num_lights = kernel_data.integrator.num_all_lights; - if (kernel_data.integrator.pdf_triangles != 0.0f) { - num_lights += 1; - } - } - else { - num_lights = 1; - } - } - - for (int i = 0; i < num_lights; i++) { - /* sample one light at random */ - int num_samples = 1; - int num_all_lights = 1; - uint lamp_rng_hash = state->rng_hash; - bool double_pdf = false; - bool is_mesh_light = false; - bool is_lamp = false; - - if (sample_all_lights) { - /* lamp sampling */ - is_lamp = i < kernel_data.integrator.num_all_lights; - if (is_lamp) { - if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) { - continue; - } - num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i)); - num_all_lights = kernel_data.integrator.num_all_lights; - lamp_rng_hash = cmj_hash(state->rng_hash, i); - double_pdf = kernel_data.integrator.pdf_triangles != 0.0f; - } - /* mesh light sampling */ - else { - num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples); - double_pdf = kernel_data.integrator.num_all_lights != 0; - is_mesh_light = true; - } - } - - float num_samples_inv = 
num_samples_adjust / (num_samples * num_all_lights); - - for (int j = 0; j < num_samples; j++) { - Ray light_ray ccl_optional_struct_init; - light_ray.t = 0.0f; /* reset ray */ -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - bool has_emission = false; - - if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) { - float light_u, light_v; - path_branched_rng_2D( - kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination( - kg, lamp_rng_hash, state, j, num_samples); - - /* only sample triangle lights */ - if (is_mesh_light && double_pdf) { - light_u = 0.5f * light_u; - } - - LightSample ls ccl_optional_struct_init; - const int lamp = is_lamp ? i : -1; - if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - /* The sampling probability returned by lamp_light_sample assumes that all lights were - * sampled. However, this code only samples lamps, so if the scene also had mesh lights, - * the real probability is twice as high. 
*/ - if (double_pdf) { - ls.pdf *= 2.0f; - } - - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission) { - if (!blocked) { - /* accumulate */ - path_radiance_accum_light(kg, - L, - state, - throughput * num_samples_inv, - &L_light, - shadow, - num_samples_inv, - is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light); - } - } - } - } -# endif -} - -/* branched path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, - ShaderData *sd, - const ShaderClosure *sc, - int sample, - int num_samples, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray, - float sum_sample_weight) -{ - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval ccl_optional_struct_init; - float3 bsdf_omega_in ccl_optional_struct_init; - differential3 bsdf_domega_in ccl_optional_struct_init; - float bsdf_u, bsdf_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample_closure( - kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - -# ifdef __DENOISING_FEATURES__ - state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); -# endif - - /* modify path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? 
-sd->Ng : sd->Ng); - ray->D = normalize(bsdf_omega_in); - ray->t = FLT_MAX; -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -# endif -# ifdef __OBJECT_MOTION__ - ray->time = sd->time; -# endif - -# ifdef __VOLUME__ - /* enter/exit volume */ - if (label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -# endif - - /* branch RNG state */ - path_state_branch(state, sample, num_samples); - - /* set MIS state */ - state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); - state->ray_pdf = bsdf_pdf; -# ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -# endif - - return true; -} - -#endif - -/* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT); - -#ifdef __EMISSION__ -# ifdef __SHADOW_TRICKS__ - int all = (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all); -# else - /* sample illumination from lights to find path contribution */ - Ray light_ray ccl_optional_struct_init; - BsdfEval L_light ccl_optional_struct_init; - bool is_lamp = false; - bool has_emission = false; - - light_ray.t = 0.0f; -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - - if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) { - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - - LightSample ls ccl_optional_struct_init; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, state); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - 
float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission) { - if (!blocked) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput, &L_light); - } - } -# endif -#endif -} - -/* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE); - - /* no BSDF? we can stop here */ - if (sd->flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval ccl_optional_struct_init; - float3 bsdf_omega_in ccl_optional_struct_init; - differential3 bsdf_domega_in ccl_optional_struct_init; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample( - kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - - /* set labels */ - if (!(label & LABEL_TRANSPARENT)) { - state->ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? 
-sd->Ng : sd->Ng); - ray->D = normalize(bsdf_omega_in); - - if (state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if (label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -#endif - return true; - } -#ifdef __VOLUME__ - else if (sd->flag & SD_HAS_ONLY_VOLUME) { - if (!path_state_volume_next(kg, state)) { - return false; - } - - if (state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - - /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; -# endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); - return true; - } -#endif - else { - /* no bsdf or volume? */ - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h deleted file mode 100644 index a787910e65c..00000000000 --- a/intern/cycles/kernel/kernel_path_volume.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME_SCATTER__ - -ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L) -{ -# ifdef __EMISSION__ - /* sample illumination from lights to find path contribution */ - Ray light_ray ccl_optional_struct_init; - BsdfEval L_light ccl_optional_struct_init; - bool is_lamp = false; - bool has_emission = false; - - light_ray.t = 0.0f; -# ifdef __OBJECT_MOTION__ - /* connect to light from given point where shader has been evaluated */ - light_ray.time = sd->time; -# endif - - if (kernel_data.integrator.use_direct_light) { - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - - LightSample ls ccl_optional_struct_init; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, state); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission && !blocked) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } -# endif /* __EMISSION__ */ -} - -ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray) -{ - /* sample phase function */ - float phase_pdf; - BsdfEval phase_eval ccl_optional_struct_init; - float3 phase_omega_in ccl_optional_struct_init; - differential3 phase_domega_in ccl_optional_struct_init; - float phase_u, phase_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v); - int label; - - label = shader_volume_phase_sample( - 
kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf); - - if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label); - - /* set labels */ - state->ray_pdf = phase_pdf; -# ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -# endif - state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); - - /* update path state */ - path_state_next(kg, state, label); - - /* Russian roulette termination of volume ray scattering. */ - float probability = path_state_continuation_probability(kg, state, *throughput); - - if (probability == 0.0f) { - return false; - } - else if (probability != 1.0f) { - /* Use dimension from the previous bounce, has not been used yet. */ - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM); - - if (terminate >= probability) { - return false; - } - - *throughput /= probability; - } - - /* setup ray */ - ray->P = sd->P; - ray->D = phase_omega_in; - ray->t = FLT_MAX; - -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = phase_domega_in; -# endif - - return true; -} - -# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__)) -ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L, - bool sample_all_lights, - Ray *ray, - const VolumeSegment *segment) -{ -# ifdef __EMISSION__ - BsdfEval L_light ccl_optional_struct_init; - - int num_lights = 1; - if (sample_all_lights) { - num_lights = kernel_data.integrator.num_all_lights; - if (kernel_data.integrator.pdf_triangles != 0.0f) { - num_lights += 1; - } - } - - for (int i = 0; i < num_lights; ++i) { - /* sample one light at random */ - int num_samples = 1; - int num_all_lights = 1; - uint lamp_rng_hash = state->rng_hash; - bool 
double_pdf = false; - bool is_mesh_light = false; - bool is_lamp = false; - - if (sample_all_lights) { - /* lamp sampling */ - is_lamp = i < kernel_data.integrator.num_all_lights; - if (is_lamp) { - if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) { - continue; - } - num_samples = light_select_num_samples(kg, i); - num_all_lights = kernel_data.integrator.num_all_lights; - lamp_rng_hash = cmj_hash(state->rng_hash, i); - double_pdf = kernel_data.integrator.pdf_triangles != 0.0f; - } - /* mesh light sampling */ - else { - num_samples = kernel_data.integrator.mesh_light_samples; - double_pdf = kernel_data.integrator.num_all_lights != 0; - is_mesh_light = true; - } - } - - float num_samples_inv = 1.0f / (num_samples * num_all_lights); - - for (int j = 0; j < num_samples; j++) { - Ray light_ray ccl_optional_struct_init; - light_ray.t = 0.0f; /* reset ray */ -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - bool has_emission = false; - - float3 tp = throughput; - - if (kernel_data.integrator.use_direct_light) { - /* sample random position on random light/triangle */ - float light_u, light_v; - path_branched_rng_2D( - kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - /* only sample triangle lights */ - if (is_mesh_light && double_pdf) { - light_u = 0.5f * light_u; - } - - LightSample ls ccl_optional_struct_init; - const int lamp = is_lamp ? i : -1; - light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls); - - /* sample position on volume segment */ - float rphase = path_branched_rng_1D( - kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); - float rscatter = path_branched_rng_1D( - kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - state, - ray, - sd, - &tp, - rphase, - rscatter, - segment, - (ls.t != FLT_MAX) ? 
&ls.P : - NULL, - false); - - if (result == VOLUME_PATH_SCATTERED) { - /* todo: split up light_sample so we don't have to call it again with new position */ - if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - if (double_pdf) { - ls.pdf *= 2.0f; - } - - /* sample random light */ - float terminate = path_branched_rng_light_termination( - kg, state->rng_hash, state, j, num_samples); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission && !blocked) { - /* accumulate */ - path_radiance_accum_light( - kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); - } - } - } -# endif /* __EMISSION__ */ -} -# endif /* __SPLIT_KERNEL__ */ - -#endif /* __VOLUME_SCATTER__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h index 780830879d8..db8644005ea 100644 --- a/intern/cycles/kernel/kernel_profiling.h +++ b/intern/cycles/kernel/kernel_profiling.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_PROFILING_H__ -#define __KERNEL_PROFILING_H__ +#pragma once #ifdef __KERNEL_CPU__ # include "util/util_profiling.h" @@ -24,23 +23,18 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_CPU__ -# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event) +# define PROFILING_INIT(kg, event) \ + ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event) # define PROFILING_EVENT(event) profiling_helper.set_event(event) -# define PROFILING_SHADER(shader) \ - if ((shader) != SHADER_NONE) { \ - profiling_helper.set_shader((shader)&SHADER_MASK); \ - } -# define PROFILING_OBJECT(object) \ - if ((object) != PRIM_NONE) { \ - profiling_helper.set_object(object); \ - } +# define PROFILING_INIT_FOR_SHADER(kg, event) \ + ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event) +# define PROFILING_SHADER(object, shader) \ + profiling_helper.set_shader(object, (shader)&SHADER_MASK); #else # define PROFILING_INIT(kg, event) # define PROFILING_EVENT(event) -# define PROFILING_SHADER(shader) -# define PROFILING_OBJECT(object) +# define PROFILING_INIT_FOR_SHADER(kg, event) +# define PROFILING_SHADER(object, shader) #endif /* __KERNEL_CPU__ */ CCL_NAMESPACE_END - -#endif /* __KERNEL_PROFILING_H__ */ diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index c33d7150b5c..192bf7ca5aa 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __KERNEL_PROJECTION_CL__ -#define __KERNEL_PROJECTION_CL__ +#pragma once CCL_NAMESPACE_BEGIN @@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam } CCL_NAMESPACE_END - -#endif /* __KERNEL_PROJECTION_CL__ */ diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h deleted file mode 100644 index d8cc08b3e85..00000000000 --- a/intern/cycles/kernel/kernel_queues.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_QUEUE_H__ -#define __KERNEL_QUEUE_H__ - -CCL_NAMESPACE_BEGIN - -/* - * Queue utility functions for split kernel - */ -#ifdef __KERNEL_OPENCL__ -# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable -#endif - -/* - * Enqueue ray index into the queue - */ -ccl_device void enqueue_ray_index( - int ray_index, /* Ray index to be enqueued. */ - int queue_number, /* Queue in which the ray index should be enqueued. */ - ccl_global int *queues, /* Buffer of all queues. */ - int queue_size, /* Size of each queue. */ - ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ -{ - /* This thread's queue index. 
*/ - int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) + - (queue_number * queue_size); - queues[my_queue_index] = ray_index; -} - -/* - * Get the ray index for this thread - * Returns a positive ray_index for threads that have to do some work; - * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work - * i.e All ray's in the queue has been successfully allocated and there - * is no more ray to allocate to other threads. - */ -ccl_device int get_ray_index( - KernelGlobals *kg, - int thread_index, /* Global thread index. */ - int queue_number, /* Queue to operate on. */ - ccl_global int *queues, /* Buffer of all queues. */ - int queuesize, /* Size of a queue. */ - int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */ -{ - int ray_index = queues[queue_number * queuesize + thread_index]; - if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) { - queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } - return ray_index; -} - -/* The following functions are to realize Local memory variant of enqueue ray index function. */ - -/* All threads should call this function. */ -ccl_device void enqueue_ray_index_local( - int ray_index, /* Ray index to enqueue. */ - int queue_number, /* Queue in which to enqueue ray index. */ - char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ - int queuesize, /* queue size. */ - ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */ - ccl_global int *Queue_data, /* Queues. */ - ccl_global int *Queue_index) /* To do global queue atomics. */ -{ - int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - - /* Get local queue id. */ - unsigned int lqidx; - if (enqueue_flag) { - lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* Get global queue offset. 
*/ - if (lidx == 0) { - *local_queue_atomics = atomic_fetch_and_add_uint32( - (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* Get global queue index and enqueue ray. */ - if (enqueue_flag) { - unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx; - Queue_data[my_gqidx] = ray_index; - } -} - -ccl_device unsigned int get_local_queue_index( - int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local_param unsigned int *local_queue_atomics) -{ - int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); - return my_lqidx; -} - -ccl_device unsigned int get_global_per_queue_offset( - int queue_number, - ccl_local_param unsigned int *local_queue_atomics, - ccl_global int *global_queue_atomics) -{ - unsigned int queue_offset = atomic_fetch_and_add_uint32( - (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]); - return queue_offset; -} - -ccl_device unsigned int get_global_queue_index( - int queue_number, - int queuesize, - unsigned int lqidx, - ccl_local_param unsigned int *global_per_queue_offset) -{ - int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; - return my_gqidx; -} - -ccl_device int dequeue_ray_index(int queue_number, - ccl_global int *queues, - int queue_size, - ccl_global int *queue_index) -{ - int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1; - - if (index < 0) { - return QUEUE_EMPTY_SLOT; - } - - return queues[index + queue_number * queue_size]; -} - -CCL_NAMESPACE_END - -#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 49e5e25c2e0..41b7d76230a 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * 
limitations under the License. */ +#pragma once #include "kernel/kernel_jitter.h" #include "util/util_hash.h" @@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN */ # define SOBOL_SKIP 64 -ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) +ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index + SOBOL_SKIP; for (int j = 0, x; (x = find_first_set(i)); i >>= x) { j += x; - result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1); + result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1)); } return result; } #endif /* __SOBOL__ */ -ccl_device_forceinline float path_rng_1D( - KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg, + uint rng_hash, + int sample, + int dimension) { #ifdef __DEBUG_CORRELATION__ return (float)drand48(); #endif - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { - return pmj_sample_1D(kg, sample, rng_hash, dimension); - } -#ifdef __CMJ__ -# ifdef __SOBOL__ - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) -# endif + +#ifdef __SOBOL__ + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) +#endif { - /* Correlated multi-jitter. */ - int p = rng_hash + dimension; - return cmj_sample_1D(sample, num_samples, p); + return pmj_sample_1D(kg, sample, rng_hash, dimension); } -#endif #ifdef __SOBOL__ /* Sobol sequence value using direction vectors. 
 */ @@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D( #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, - uint rng_hash, - int sample, - int num_samples, - int dimension, - float *fx, - float *fy) +ccl_device_forceinline void path_rng_2D( + const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy) { #ifdef __DEBUG_CORRELATION__ *fx = (float)drand48(); *fy = (float)drand48(); return; #endif - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { - const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension); - *fx = f.x; - *fy = f.y; - return; - } -#ifdef __CMJ__ -# ifdef __SOBOL__ - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) -# endif + +#ifdef __SOBOL__ + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) +#endif { - /* Correlated multi-jitter. */ - int p = rng_hash + dimension; - cmj_sample_2D(sample, num_samples, p, fx, fy); + pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy); + return; } -#endif #ifdef __SOBOL__ /* Sobol. */ - *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); + *fx = path_rng_1D(kg, rng_hash, sample, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1); #endif } -ccl_device_inline void path_rng_init(KernelGlobals *kg, - int sample, - int num_samples, - uint *rng_hash, - int x, - int y, - float *fx, - float *fy) +/** + * 1D hash recommended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No. 3, 2020 + * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh + * http://www.jcgt.org/published/0009/03/02/paper.pdf + */ +ccl_device_inline uint hash_iqint1(uint n) +{ + n = (n << 13U) ^ n; + n = n * (n * n * 15731U + 789221U) + 1376312589U; + + return n; +} + +/** + * 2D hash recommended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No.
3, 2020 + * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh + * http://www.jcgt.org/published/0009/03/02/paper.pdf + */ +ccl_device_inline uint hash_iqnt2d(const uint x, const uint y) { - /* load state */ - *rng_hash = hash_uint2(x, y); - *rng_hash ^= kernel_data.integrator.seed; + const uint qx = 1103515245U * ((x >> 1U) ^ (y)); + const uint qy = 1103515245U * ((y >> 1U) ^ (x)); + const uint n = 1103515245U * ((qx) ^ (qy >> 3U)); + + return n; +} + +ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg, + const int sample, + const int x, + const int y) +{ + const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed; #ifdef __DEBUG_CORRELATION__ - srand48(*rng_hash + sample); + srand48(rng_hash + sample); +#else + (void)sample; #endif - if (sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); - } + return rng_hash; } /* Linear Congruential Generator */ @@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed) return rng; } -/* Path Tracing Utility Functions - * - * For each random number in each step of the path we must have a unique - * dimension to avoid using the same sequence twice. - * - * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. 
- */ - -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, - const ccl_addr_space PathState *state, - int dimension) -{ - return path_rng_1D( - kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension); -} - -ccl_device_inline void path_state_rng_2D( - KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) -{ - path_rng_2D(kg, - state->rng_hash, - state->sample, - state->num_samples, - state->rng_offset + dimension, - fx, - fy); -} - -ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg, - const ccl_addr_space PathState *state, - uint hash) -{ - /* Use a hash instead of dimension, this is not great but avoids adding - * more dimensions to each bounce which reduces quality of dimensions we - * are already using. */ - return path_rng_1D(kg, - cmj_hash_simple(state->rng_hash, hash), - state->sample, - state->num_samples, - state->rng_offset); -} - -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension) -{ - return path_rng_1D(kg, - rng_hash, - state->sample * num_branches + branch, - state->num_samples * num_branches, - state->rng_offset + dimension); -} - -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension, - float *fx, - float *fy) -{ - path_rng_2D(kg, - rng_hash, - state->sample * num_branches + branch, - state->num_samples * num_branches, - state->rng_offset + dimension, - fx, - fy); -} - -/* Utility functions to get light termination value, - * since it might not be needed in many cases. 
- */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, - const ccl_addr_space PathState *state) -{ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); - } - return 0.0f; -} - -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches) -{ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE); - } - return 0.0f; -} - -ccl_device_inline uint lcg_state_init(PathState *state, uint scramble) -{ - return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble); -} - -ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(const uint rng_hash, + const uint rng_offset, + const uint sample, + const uint scramble) { - return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble); + return lcg_init(rng_hash + rng_offset + sample * scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) @@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample) return __builtin_popcount(sample & 0xaaaaaaaa) & 1; #elif defined(__NVCC__) return __popc(sample & 0xaaaaaaaa) & 1; -#elif defined(__KERNEL_OPENCL__) - return popcount(sample & 0xaaaaaaaa) & 1; #else /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */ int i = sample & 0xaaaaaaaa; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 7f02e6fc7b3..3052bb53040 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -14,14 +14,9 @@ * limitations under the License. 
*/ -/* - * ShaderData, used in four steps: - * - * Setup from incoming ray, sampled position and background. - * Execute for surface, volume or displacement. - * Evaluate one or more closures. - * Release. - */ +/* Functions to evaluate shaders and use the resulting shader closures. */ + +#pragma once // clang-format off #include "kernel/closure/alloc.h" @@ -30,479 +25,39 @@ #include "kernel/closure/emissive.h" // clang-format on +#include "kernel/kernel_accumulate.h" #include "kernel/svm/svm.h" -CCL_NAMESPACE_BEGIN - -/* ShaderData setup from incoming ray */ - -#ifdef __OBJECT_MOTION__ -ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) -{ - if (sd->object_flag & SD_OBJECT_MOTION) { - sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); - sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); - } - else { - sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); - } -} -#endif - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline -#else -ccl_device_noinline -#endif - void - shader_setup_from_ray(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - sd->object = (isect->object == OBJECT_NONE) ? 
kernel_tex_fetch(__prim_object, isect->prim) : - isect->object; - sd->lamp = LAMP_NONE; - - sd->type = isect->type; - sd->flag = 0; - sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); - - /* matrices and time */ -#ifdef __OBJECT_MOTION__ - shader_setup_object_transforms(kg, sd, ray->time); -#endif - sd->time = ray->time; - - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->ray_length = isect->t; - - sd->u = isect->u; - sd->v = isect->v; - -#ifdef __HAIR__ - if (sd->type & PRIMITIVE_ALL_CURVE) { - /* curve */ - curve_shader_setup(kg, sd, isect, ray); - } - else -#endif - if (sd->type & PRIMITIVE_TRIANGLE) { - /* static triangle */ - float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); - - /* vectors */ - sd->P = triangle_refine(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; - - /* smooth normal */ - if (sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - -#ifdef __DPDU__ - /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); -#endif - } - else { - /* motion triangle */ - motion_triangle_shader_setup(kg, sd, isect, ray, false); - } - - sd->I = -ray->D; - - sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - - if (isect->object != OBJECT_NONE) { - /* instance transform */ - object_normal_transform_auto(kg, sd, &sd->N); - object_normal_transform_auto(kg, sd, &sd->Ng); -#ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); -#endif - } - - /* backfacing test */ - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); - - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -#ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -#endif - } - -#ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); - differential_incoming(&sd->dI, ray->dD); - 
differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); -#endif - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup from BSSRDF scatter */ - -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif - void - shader_setup_from_subsurface(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - const bool backfacing = sd->flag & SD_BACKFACING; - - /* object, matrices, time, ray_length stay the same */ - sd->flag = 0; - sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->type = isect->type; - - sd->u = isect->u; - sd->v = isect->v; - - /* fetch triangle data */ - if (sd->type == PRIMITIVE_TRIANGLE) { - float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); - - /* static triangle */ - sd->P = triangle_refine_local(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; - - if (sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - -# ifdef __DPDU__ - /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); -# endif - } - else { - /* motion triangle */ - motion_triangle_shader_setup(kg, sd, isect, ray, true); - } - - sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - - if (isect->object != OBJECT_NONE) { - /* instance transform */ - object_normal_transform_auto(kg, sd, &sd->N); - object_normal_transform_auto(kg, sd, &sd->Ng); -# ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); -# endif - } - - /* backfacing test */ - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -# ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -# endif - } - - /* should not get used in principle as the shading will only 
use a diffuse - * BSDF, but the shader might still access it */ - sd->I = sd->N; - -# ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); - /* don't modify dP and dI */ -# endif - - PROFILING_SHADER(sd->shader); -} -#endif - -/* ShaderData setup from position sampled on mesh */ - -ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, - ShaderData *sd, - const float3 P, - const float3 Ng, - const float3 I, - int shader, - int object, - int prim, - float u, - float v, - float t, - float time, - bool object_space, - int lamp) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = P; - sd->N = Ng; - sd->Ng = Ng; - sd->I = I; - sd->shader = shader; - if (prim != PRIM_NONE) - sd->type = PRIMITIVE_TRIANGLE; - else if (lamp != LAMP_NONE) - sd->type = PRIMITIVE_LAMP; - else - sd->type = PRIMITIVE_NONE; - - /* primitive */ - sd->object = object; - sd->lamp = LAMP_NONE; - /* Currently no access to bvh prim index for strand sd->prim. 
*/ - sd->prim = prim; - sd->u = u; - sd->v = v; - sd->time = time; - sd->ray_length = t; - - sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - sd->object_flag = 0; - if (sd->object != OBJECT_NONE) { - sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object); - -#ifdef __OBJECT_MOTION__ - shader_setup_object_transforms(kg, sd, time); - } - else if (lamp != LAMP_NONE) { - sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); - sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); - sd->lamp = lamp; -#else - } - else if (lamp != LAMP_NONE) { - sd->lamp = lamp; -#endif - } - - /* transform into world space */ - if (object_space) { - object_position_transform_auto(kg, sd, &sd->P); - object_normal_transform_auto(kg, sd, &sd->Ng); - sd->N = sd->Ng; - object_dir_transform_auto(kg, sd, &sd->I); - } - - if (sd->type & PRIMITIVE_TRIANGLE) { - /* smooth normal */ - if (sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - - if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &sd->N); - } - } - - /* dPdu/dPdv */ -#ifdef __DPDU__ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); - - if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); - } -#endif - } - else { -#ifdef __DPDU__ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -#endif - } - - /* backfacing test */ - if (sd->prim != PRIM_NONE) { - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); - - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -#ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -#endif - } - } - -#ifdef __RAY_DIFFERENTIALS__ - /* no ray differentials here yet */ - sd->dP = differential3_zero(); - sd->dI = differential3_zero(); - sd->du = differential_zero(); - sd->dv = differential_zero(); -#endif - - PROFILING_SHADER(sd->shader); - 
PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup for displacement */ - -ccl_device void shader_setup_from_displace( - KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v) -{ - float3 P, Ng, I = zero_float3(); - int shader; - - triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); - - /* force smooth shading for displacement */ - shader |= SHADER_SMOOTH_NORMAL; - - shader_setup_from_sample( - kg, - sd, - P, - Ng, - I, - shader, - object, - prim, - u, - v, - 0.0f, - 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), - LAMP_NONE); -} - -/* ShaderData setup from ray into background */ - -ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, - ShaderData *sd, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = ray->D; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; - sd->shader = kernel_data.background.surface_shader; - sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - sd->object_flag = 0; - sd->time = ray->time; - sd->ray_length = 0.0f; - - sd->object = OBJECT_NONE; - sd->lamp = LAMP_NONE; - sd->prim = PRIM_NONE; - sd->u = 0.0f; - sd->v = 0.0f; - -#ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -#endif - -#ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); +#ifdef __OSL__ +# include "kernel/osl/osl_shader.h" #endif - /* for NDC coordinates */ - sd->ray_P = ray->P; - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup from point inside volume */ - -#ifdef __VOLUME__ -ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = ray->P; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; 
- sd->shader = SHADER_NONE; - sd->flag = 0; - sd->object_flag = 0; - sd->time = ray->time; - sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */ - - sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */ - sd->lamp = LAMP_NONE; - sd->prim = PRIM_NONE; - sd->type = PRIMITIVE_NONE; - - sd->u = 0.0f; - sd->v = 0.0f; - -# ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -# endif - -# ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); -# endif - - /* for NDC coordinates */ - sd->ray_P = ray->P; - sd->ray_dP = ray->dP; - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} -#endif /* __VOLUME__ */ +CCL_NAMESPACE_BEGIN /* Merging */ -#if defined(__BRANCHED_PATH__) || defined(__VOLUME__) -ccl_device_inline void shader_merge_closures(ShaderData *sd) +#if defined(__VOLUME__) +ccl_device_inline void shader_merge_volume_closures(ShaderData *sd) { - /* merge identical closures, better when we sample a single closure at a time */ + /* Merge identical closures to save closure space with stacked volumes. */ for (int i = 0; i < sd->num_closure; i++) { ShaderClosure *sci = &sd->closure[i]; + if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) { + continue; + } + for (int j = i + 1; j < sd->num_closure; j++) { ShaderClosure *scj = &sd->closure[j]; - - if (sci->type != scj->type) + if (sci->type != scj->type) { continue; - if (!bsdf_merge(sci, scj)) + } + + const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci; + const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj; + if (!(hgi->g == hgj->g)) { continue; + } sci->weight += scj->weight; sci->sample_weight += scj->sample_weight; @@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } } } -#endif /* __BRANCHED_PATH__ || __VOLUME__ */ -/* Defensive sampling. 
*/ +ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases, + const ShaderData *ccl_restrict sd) +{ + phases->num_closure = 0; + + for (int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *from_sc = &sd->closure[i]; + const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc; + + if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) { + ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure]; + + to_sc->weight = from_sc->weight; + to_sc->sample_weight = from_sc->sample_weight; + to_sc->g = from_hg->g; + phases->num_closure++; + if (phases->num_closure >= MAX_VOLUME_CLOSURE) { + break; + } + } + } +} +#endif /* __VOLUME__ */ -ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state) +ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd) { - /* We can likely also do defensive sampling at deeper bounces, particularly + /* Defensive sampling. + * + * We can likely also do defensive sampling at deeper bounces, particularly * for cases like a perfect mirror but possibly also others. This will need * a good heuristic. */ - if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 && + sd->num_closure > 1) { float sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { @@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa } } } + + /* Filter glossy. 
+ * + * Blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if (kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf); + + if (blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f; + + for (int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if (CLOSURE_IS_BSDF(sc->type)) { + bsdf_blur(kg, sc, blur_roughness); + } + } + } + } } /* BSDF */ -ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, - ShaderData *sd, - const float3 omega_in, - float *pdf, - const ShaderClosure *skip_sc, - BsdfEval *result_eval, - float sum_pdf, - float sum_sample_weight) +ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in) +{ + return dot(sd->N, omega_in) < 0.0f; +} + +ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags) +{ + if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) { + return false; + } + if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) { + if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) { + return true; + } + } + if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) { + if (CLOSURE_IS_BSDF_GLOSSY(type)) { + return true; + } + } + if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) { + if (CLOSURE_IS_BSDF_TRANSMISSION(type)) { + return true; + } + } + return false; +} + +ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg, + ShaderData *sd, + const float3 omega_in, + const bool is_transmission, + const ShaderClosure *skip_sc, + BsdfEval *result_eval, + float sum_pdf, + float sum_sample_weight, + const uint light_shader_flags) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ for (int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = 
&sd->closure[i]; - if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { - float bsdf_pdf = 0.0f; - float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); + if (sc == skip_sc) { + continue; + } + + if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) { + float bsdf_pdf = 0.0f; + float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf); - if (bsdf_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f); - sum_pdf += bsdf_pdf * sc->sample_weight; + if (bsdf_pdf != 0.0f) { + const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || + CLOSURE_IS_BSDF_BSSRDF(sc->type)); + bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f); + sum_pdf += bsdf_pdf * sc->sample_weight; + } } sum_sample_weight += sc->sample_weight; } } - *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; -} - -#ifdef __BRANCHED_PATH__ -ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, - ShaderData *sd, - const float3 omega_in, - BsdfEval *result_eval, - float light_pdf, - bool use_mis) -{ - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - if (CLOSURE_IS_BSDF(sc->type)) { - float bsdf_pdf = 0.0f; - float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); - if (bsdf_pdf != 0.0f) { - float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f; - bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight); - } - } - } + return (sum_sample_weight > 0.0f) ? 
sum_pdf / sum_sample_weight : 0.0f; } -#endif /* __BRANCHED_PATH__ */ #ifndef __KERNEL_CUDA__ ccl_device #else ccl_device_inline #endif - void - shader_bsdf_eval(KernelGlobals *kg, + float + shader_bsdf_eval(const KernelGlobals *kg, ShaderData *sd, const float3 omega_in, - BsdfEval *eval, - float light_pdf, - bool use_mis) + const bool is_transmission, + BsdfEval *bsdf_eval, + const uint light_shader_flags) { - PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL); - - bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass); + bsdf_eval_init(bsdf_eval, false, zero_float3()); -#ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) - _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis); - else -#endif - { - float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); - if (use_mis) { - float weight = power_heuristic(light_pdf, pdf); - bsdf_eval_mis(eval, weight); - } - } + return _shader_bsdf_multi_eval( + kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags); } -ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu) +/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */ +ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd, + float *randu) { - /* Note the sampling here must match shader_bssrdf_pick, - * since we reuse the same random number. */ int sampled = 0; if (sd->num_closure > 1) { @@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r } } - const ShaderClosure *sc = &sd->closure[sampled]; - return CLOSURE_IS_BSDF(sc->type) ? sc : NULL; + return &sd->closure[sampled]; } -ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, - ccl_addr_space float3 *throughput, - float *randu) +/* Return weight for picked BSSRDF. 
*/ +ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd, + const ShaderClosure *ccl_restrict bssrdf_sc) { - /* Note the sampling here must match shader_bsdf_pick, - * since we reuse the same random number. */ - int sampled = 0; + float3 weight = bssrdf_sc->weight; if (sd->num_closure > 1) { - /* Pick a BSDF or BSSRDF or based on sample weights. */ - float sum_bsdf = 0.0f; - float sum_bssrdf = 0.0f; - - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_BSDF(sc->type)) { - sum_bsdf += sc->sample_weight; - } - else if (CLOSURE_IS_BSSRDF(sc->type)) { - sum_bssrdf += sc->sample_weight; - } - } - - float r = (*randu) * (sum_bsdf + sum_bssrdf); - float partial_sum = 0.0f; - + float sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { - float next_sum = partial_sum + sc->sample_weight; - - if (r < next_sum) { - if (CLOSURE_IS_BSDF(sc->type)) { - *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; - return NULL; - } - else { - *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; - sampled = i; - - /* Rescale to reuse for direction sample, to better preserve stratification. */ - *randu = (r - partial_sum) / sc->sample_weight; - break; - } - } - - partial_sum = next_sum; + sum += sc->sample_weight; } } + weight *= sum / bssrdf_sc->sample_weight; } - const ShaderClosure *sc = &sd->closure[sampled]; - return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL; -} - -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, - float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) -{ - PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); - - const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); - if (sc == NULL) { - *pdf = 0.0f; - return LABEL_NONE; - } - - /* BSSRDF should already have been handled elsewhere. 
*/ - kernel_assert(CLOSURE_IS_BSDF(sc->type)); - - int label; - float3 eval = zero_float3(); - - *pdf = 0.0f; - label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); - - if (*pdf != 0.0f) { - bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass); - - if (sd->num_closure > 1) { - float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight); - } - } - - return label; + return weight; } -ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, +/* Sample direction for picked BSDF, and return evaluation and pdf for all + * BSDFs combined using MIS. */ +ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, @@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + /* BSSRDF should already have been handled elsewhere. 
*/ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval = zero_float3(); @@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, *pdf = 0.0f; label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); - if (*pdf != 0.0f) - bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass); + if (*pdf != 0.0f) { + const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || + CLOSURE_IS_BSDF_BSSRDF(sc->type)); + bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight); + + if (sd->num_closure > 1) { + const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in); + float sweight = sc->sample_weight; + *pdf = _shader_bsdf_multi_eval( + kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0); + } + } return label; } -ccl_device float shader_bsdf_average_roughness(ShaderData *sd) +ccl_device float shader_bsdf_average_roughness(const ShaderData *sd) { float roughness = 0.0f; float sum_weight = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF(sc->type)) { /* sqrt once to undo the squaring from multiplying roughness on the @@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd) return (sum_weight > 0.0f) ? 
roughness / sum_weight : 0.0f; } -ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) -{ - for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_BSDF(sc->type)) - bsdf_blur(kg, sc, roughness); - } -} - -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd) { if (sd->flag & SD_HAS_ONLY_VOLUME) { return one_float3(); @@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData * } } -ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd) { if (sd->flag & SD_TRANSPARENT) { for (int i = 0; i < sd->num_closure; i++) { @@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData * } } -ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd) { float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd); @@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) return alpha; } -ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) @@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const 
ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd) { float3 N = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) N += sc->N * fabsf(average(sc->weight)); } @@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) return (is_zero(N)) ? sd->N : normalize(N); } -ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) +ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd) { - float3 eval = zero_float3(); float3 N = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc; - eval += sc->weight * ao_factor; N += bsdf->N * fabsf(average(sc->weight)); } } - *N_ = (is_zero(N)) ? 
sd->N : normalize(N); - return eval; + return (is_zero(N)) ? sd->N : normalize(N); } #ifdef __SUBSURFACE__ -ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_) +ccl_device float3 shader_bssrdf_normal(const ShaderData *sd) { - float3 eval = zero_float3(); float3 N = zero_float3(); - float texture_blur = 0.0f, weight_sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf *)sc; float avg_weight = fabsf(average(sc->weight)); N += bssrdf->N * avg_weight; - eval += sc->weight; - texture_blur += bssrdf->texture_blur * avg_weight; - weight_sum += avg_weight; } } - if (N_) - *N_ = (is_zero(N)) ? sd->N : normalize(N); - - if (texture_blur_) - *texture_blur_ = safe_divide(texture_blur, weight_sum); - - return eval; + return (is_zero(N)) ? sd->N : normalize(N); } #endif /* __SUBSURFACE__ */ /* Constant emission optimization */ -ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval) +ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval) { int shader_index = shader & SHADER_MASK; int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags; @@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo /* Background */ -ccl_device float3 shader_background_eval(ShaderData *sd) +ccl_device float3 shader_background_eval(const ShaderData *sd) { if (sd->flag & SD_EMISSION) { return sd->closure_emission_background; @@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd) /* Emission */ -ccl_device float3 shader_emissive_eval(ShaderData *sd) +ccl_device float3 shader_emissive_eval(const ShaderData *sd) { if (sd->flag & SD_EMISSION) { return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background; @@ -1016,7 +529,7 @@ ccl_device float3 
shader_emissive_eval(ShaderData *sd) /* Holdout */ -ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd) { float3 weight = zero_float3(); @@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) } else { for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_HOLDOUT(sc->type)) { weight += sc->weight; } @@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, +template<uint node_feature_mask> +ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *ccl_restrict sd, + ccl_global float *ccl_restrict buffer, int path_flag) { - PROFILING_INIT(kg, PROFILING_SHADER_EVAL); - /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow * shader data also do not have a closure array to save GPU memory. 
*/ @@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, max_closures = 0; } else { - max_closures = kernel_data.integrator.max_closures; + max_closures = kernel_data.max_closures; } sd->num_closure = 0; @@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #ifdef __OSL__ if (kg->osl) { if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) { - OSLShader::eval_background(kg, sd, state, path_flag); + OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag); } else { - OSLShader::eval_surface(kg, sd, state, path_flag); + OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag); } } else #endif { #ifdef __SVM__ - svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag); + svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>( + INTEGRATOR_STATE_PASS, sd, buffer, path_flag); #else if (sd->object == OBJECT_NONE) { sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f); @@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #endif } - if (sd->flag & SD_BSDF_NEEDS_LCG) { - sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); + if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) { + sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash), + INTEGRATOR_STATE(path, rng_offset), + INTEGRATOR_STATE(path, sample), + 0xb4bc3953); } } @@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #ifdef __VOLUME__ -ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, - const float3 omega_in, - float *pdf, - int skip_phase, - BsdfEval *result_eval, - float sum_pdf, - float sum_sample_weight) +ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd, + const ShaderVolumePhases *phases, + const float3 omega_in, + int skip_phase, + BsdfEval *result_eval, + float sum_pdf, + float sum_sample_weight) { - for (int i = 0; i < sd->num_closure; i++) { + for (int i = 0; i < phases->num_closure; i++) { 
if (i == skip_phase) continue; - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_PHASE(sc->type)) { - float phase_pdf = 0.0f; - float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf); + const ShaderVolumeClosure *svc = &phases->closure[i]; + float phase_pdf = 0.0f; + float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf); - if (phase_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval, 1.0f); - sum_pdf += phase_pdf * sc->sample_weight; - } - - sum_sample_weight += sc->sample_weight; + if (phase_pdf != 0.0f) { + bsdf_eval_accum(result_eval, false, eval, 1.0f); + sum_pdf += phase_pdf * svc->sample_weight; } + + sum_sample_weight += svc->sample_weight; } - *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; + return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; } -ccl_device void shader_volume_phase_eval( - KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) +ccl_device float shader_volume_phase_eval(const KernelGlobals *kg, + const ShaderData *sd, + const ShaderVolumePhases *phases, + const float3 omega_in, + BsdfEval *phase_eval) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL); + bsdf_eval_init(phase_eval, false, zero_float3()); - bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass); - - _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f); + return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f); } -ccl_device int shader_volume_phase_sample(KernelGlobals *kg, +ccl_device int shader_volume_phase_sample(const KernelGlobals *kg, const ShaderData *sd, + const ShaderVolumePhases *phases, float randu, float randv, BsdfEval *phase_eval, @@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); - int sampled = 0; - if (sd->num_closure > 1) { + if 
(phases->num_closure > 1) { /* pick a phase closure based on sample weights */ float sum = 0.0f; - for (sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - - if (CLOSURE_IS_PHASE(sc->type)) - sum += sc->sample_weight; + for (sampled = 0; sampled < phases->num_closure; sampled++) { + const ShaderVolumeClosure *svc = &phases->closure[sampled]; + sum += svc->sample_weight; } float r = randu * sum; float partial_sum = 0.0f; - for (sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for (sampled = 0; sampled < phases->num_closure; sampled++) { + const ShaderVolumeClosure *svc = &phases->closure[sampled]; + float next_sum = partial_sum + svc->sample_weight; - if (CLOSURE_IS_PHASE(sc->type)) { - float next_sum = partial_sum + sc->sample_weight; - - if (r <= next_sum) { - /* Rescale to reuse for BSDF direction sample. */ - randu = (r - partial_sum) / sc->sample_weight; - break; - } - - partial_sum = next_sum; + if (r <= next_sum) { + /* Rescale to reuse for BSDF direction sample. 
*/ + randu = (r - partial_sum) / svc->sample_weight; + break; } + + partial_sum = next_sum; } - if (sampled == sd->num_closure) { + if (sampled == phases->num_closure) { *pdf = 0.0f; return LABEL_NONE; } @@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, /* todo: this isn't quite correct, we don't weight anisotropy properly * depending on color channels, even if this is perhaps not a common case */ - const ShaderClosure *sc = &sd->closure[sampled]; + const ShaderVolumeClosure *svc = &phases->closure[sampled]; int label; float3 eval = zero_float3(); *pdf = 0.0f; - label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); + label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf); if (*pdf != 0.0f) { - bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass); + bsdf_eval_init(phase_eval, false, eval); } return label; } -ccl_device int shader_phase_sample_closure(KernelGlobals *kg, +ccl_device int shader_phase_sample_closure(const KernelGlobals *kg, const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *sc, float randu, float randv, BsdfEval *phase_eval, @@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); - int label; float3 eval = zero_float3(); @@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); if (*pdf != 0.0f) - bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass); + bsdf_eval_init(phase_eval, false, eval); return label; } /* Volume Evaluation */ -ccl_device_inline void shader_eval_volume(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_addr_space VolumeStack *stack, - int path_flag) +template<typename StackReadOp> +ccl_device_inline void 
shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *ccl_restrict sd, + const int path_flag, + StackReadOp stack_read) { /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow @@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, max_closures = 0; } else { - max_closures = kernel_data.integrator.max_closures; + max_closures = kernel_data.max_closures; } /* reset closures once at the start, we will be accumulating the closures @@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, sd->num_closure_left = max_closures; sd->flag = 0; sd->object_flag = 0; - sd->type = PRIMITIVE_VOLUME; - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { + for (int i = 0;; i++) { + const VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + /* setup shaderdata from stack. it's mostly setup already in * shader_setup_from_volume, this switching should be quick */ - sd->object = stack[i].object; + sd->object = entry.object; sd->lamp = LAMP_NONE; - sd->shader = stack[i].shader; + sd->shader = entry.shader; sd->flag &= ~SD_SHADER_FLAGS; sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; @@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, # ifdef __SVM__ # ifdef __OSL__ if (kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag); + OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag); } else # endif { - svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag); + svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>( + INTEGRATOR_STATE_PASS, sd, NULL, path_flag); } # endif - /* merge closures to avoid exceeding number of closures limit */ + /* Merge closures to avoid exceeding number of closures limit. 
*/ if (i > 0) - shader_merge_closures(sd); + shader_merge_volume_closures(sd); } } @@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state) +ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd) { sd->num_closure = 0; sd->num_closure_left = 0; @@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if (kg->osl) - OSLShader::eval_displacement(kg, sd, state); + OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd); else # endif { - svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0); + svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>( + INTEGRATOR_STATE_PASS, sd, NULL, 0); } #endif } @@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, /* Transparent Shadows */ #ifdef __TRANSPARENT_SHADOWS__ -ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect) +ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect) { - int prim = kernel_tex_fetch(__prim_index, isect->prim); - int shader = 0; - -# ifdef __HAIR__ - if (isect->type & PRIMITIVE_ALL_TRIANGLE) { -# endif - shader = kernel_tex_fetch(__tri_shader, prim); -# ifdef __HAIR__ - } - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -# endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0; + return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0; } #endif /* __TRANSPARENT_SHADOWS__ */ -ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader) +ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader) { return kernel_tex_fetch(__shaders, (shader & 
SHADER_MASK)).cryptomatte_id; } diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h deleted file mode 100644 index 3b124122fba..00000000000 --- a/intern/cycles/kernel/kernel_shadow.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME__ -/* Get PathState ready for use for volume stack evaluation. */ -# ifdef __SPLIT_KERNEL__ -ccl_addr_space -# endif - ccl_device_inline PathState * - shadow_blocked_volume_path_state(KernelGlobals *kg, - VolumeState *volume_state, - ccl_addr_space PathState *state, - ShaderData *sd, - Ray *ray) -{ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = - &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState *ps = &volume_state->ps; -# endif - *ps = *state; - /* We are checking for shadow on the "other" side of the surface, so need - * to discard volume we are currently at. - */ - if (dot(sd->Ng, ray->D) < 0.0f) { - kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); - } - return ps; -} -#endif /* __VOLUME__ */ - -/* Attenuate throughput accordingly to the given intersection event. - * Returns true if the throughput is zero and traversal can be aborted. 
- */ -ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, -#ifdef __VOLUME__ - ccl_addr_space PathState *volume_state, -#endif - Intersection *isect, - Ray *ray, - float3 *throughput) -{ -#ifdef __VOLUME__ - /* Attenuation between last surface and next surface. */ - if (volume_state->volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput); - } -#endif - /* Setup shader data at surface. */ - shader_setup_from_ray(kg, shadow_sd, isect, ray); - /* Attenuation from transparent surface. */ - if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW); - path_state_modify_bounce(state, false); - *throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - /* Stop if all light is blocked. */ - if (is_zero(*throughput)) { - return true; - } -#ifdef __VOLUME__ - /* Exit/enter volume. */ - kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack); -#endif - return false; -} - -/* Special version which only handles opaque shadows. */ -ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - float3 *shadow) -{ - const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect); -#ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); - } -#endif - return blocked; -} - -#ifdef __TRANSPARENT_SHADOWS__ -# ifdef __SHADOW_RECORD_ALL__ -/* Shadow function to compute how much light is blocked, - * - * We trace a single ray. 
If it hits any opaque surface, or more than a given - * number of transparent surfaces is hit, then we consider the geometry to be - * entirely blocked. If not, all transparent surfaces will be recorded and we - * will shade them one by one to determine how much light is blocked. This all - * happens in one scene intersection function. - * - * Recording all hits works well in some cases but may be slower in others. If - * we have many semi-transparent hairs, one intersection may be faster because - * you'd be reinteresecting the same hairs a lot with each step otherwise. If - * however there is mostly binary transparency then we may be recording many - * unnecessary intersections when one of the first surfaces blocks all light. - * - * From tests in real scenes it seems the performance loss is either minimal, - * or there is a performance increase anyway due to avoiding the need to send - * two rays with transparent shadows. - * - * On CPU it'll handle all transparent bounces (by allocating storage for - * intersections when they don't fit into the stack storage). - * - * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this - * is something to be kept an eye on. - */ - -# define SHADOW_STACK_MAX_HITS 64 - -/* Actual logic with traversal loop implementation which is free from device - * specific tweaks. - * - * Note that hits array should be as big as max_hits+1. - */ -ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *hits, - uint max_hits, - float3 *shadow) -{ - /* Intersect to find an opaque surface, or record all transparent - * surface hits. 
- */ - uint num_hits; - const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits); -# ifdef __VOLUME__ -# ifdef __KERNEL_OPTIX__ - VolumeState &volume_state = kg->volume_state; -# else - VolumeState volume_state; -# endif -# endif - /* If no opaque surface found but we did find transparent hits, - * shade them. - */ - if (!blocked && num_hits > 0) { - float3 throughput = one_float3(); - float3 Pend = ray->P + ray->D * ray->t; - float last_t = 0.0f; - int bounce = state->transparent_bounce; - Intersection *isect = hits; -# ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); -# endif - sort_intersections(hits, num_hits); - for (int hit = 0; hit < num_hits; hit++, isect++) { - /* Adjust intersection distance for moving ray forward. */ - float new_t = isect->t; - isect->t -= last_t; - /* Skip hit if we did not move forward, step by step raytracing - * would have skipped it as well then. - */ - if (last_t == new_t) { - continue; - } - last_t = new_t; - /* Attenuate the throughput. */ - if (shadow_handle_transparent_isect(kg, - shadow_sd, - state, -# ifdef __VOLUME__ - ps, -# endif - isect, - ray, - &throughput)) { - return true; - } - /* Move ray forward. */ - ray->P = shadow_sd->P; - if (ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - } -# ifdef __VOLUME__ - /* Attenuation for last line segment towards light. */ - if (ps->volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); - } -# endif - *shadow = throughput; - return is_zero(throughput); - } -# ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. 
*/ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); - kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); - } -# endif - return blocked; -} - -/* Here we do all device specific trickery before invoking actual traversal - * loop to help readability of the actual logic. - */ -ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - uint max_hits, - float3 *shadow) -{ -# ifdef __SPLIT_KERNEL__ - Intersection hits_[SHADOW_STACK_MAX_HITS]; - Intersection *hits = &hits_[0]; -# elif defined(__KERNEL_CUDA__) - Intersection *hits = kg->hits_stack; -# else - Intersection hits_stack[SHADOW_STACK_MAX_HITS]; - Intersection *hits = hits_stack; -# endif -# ifndef __KERNEL_GPU__ - /* Prefer to use stack but use dynamic allocation if too deep max hits - * we need max_hits + 1 storage space due to the logic in - * scene_intersect_shadow_all which will first store and then check if - * the limit is exceeded. - * - * Ignore this on GPU because of slow/unavailable malloc(). - */ - if (max_hits + 1 > SHADOW_STACK_MAX_HITS) { - if (kg->transparent_shadow_intersections == NULL) { - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) * - (transparent_max_bounce + 1)); - } - hits = kg->transparent_shadow_intersections; - } -# endif /* __KERNEL_GPU__ */ - /* Invoke actual traversal. */ - return shadow_blocked_transparent_all_loop( - kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow); -} -# endif /* __SHADOW_RECORD_ALL__ */ - -# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) -/* Shadow function to compute how much light is blocked, - * - * Here we raytrace from one transparent surface to the next step by step. 
- * To minimize overhead in cases where we don't need transparent shadows, we - * first trace a regular shadow ray. We check if the hit primitive was - * potentially transparent, and only in that case start marching. this gives - * one extra ray cast for the cases were we do want transparency. - */ - -/* This function is only implementing device-independent traversal logic - * which requires some precalculation done. - */ -ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - const bool blocked, - const bool is_transparent_isect, - float3 *shadow) -{ -# ifdef __VOLUME__ -# ifdef __KERNEL_OPTIX__ - VolumeState &volume_state = kg->volume_state; -# else - VolumeState volume_state; -# endif -# endif - if (blocked && is_transparent_isect) { - float3 throughput = one_float3(); - float3 Pend = ray->P + ray->D * ray->t; - int bounce = state->transparent_bounce; -# ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); -# endif - for (;;) { - if (bounce >= kernel_data.integrator.transparent_max_bounce) { - return true; - } - if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) { - break; - } - if (!shader_transparent_shadow(kg, isect)) { - return true; - } - /* Attenuate the throughput. */ - if (shadow_handle_transparent_isect(kg, - shadow_sd, - state, -# ifdef __VOLUME__ - ps, -# endif - isect, - ray, - &throughput)) { - return true; - } - /* Move ray forward. */ - ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); - if (ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - } -# ifdef __VOLUME__ - /* Attenuation for last line segment towards light. 
*/ - if (ps->volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); - } -# endif - *shadow *= throughput; - return is_zero(throughput); - } -# ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. */ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); - kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); - } -# endif - return blocked; -} - -ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - float3 *shadow) -{ - bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect); - bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false; - return shadow_blocked_transparent_stepped_loop( - kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow); -} - -# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ -#endif /* __TRANSPARENT_SHADOWS__ */ - -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *shadow) -{ - *shadow = one_float3(); -#if !defined(__KERNEL_OPTIX__) - /* Some common early checks. - * Avoid conditional trace call in OptiX though, since those hurt performance there. - */ - if (ray->t == 0.0f) { - return false; - } -#endif -#ifdef __SHADOW_TRICKS__ - const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER : - PATH_RAY_SHADOW; -#else - const uint visibility = PATH_RAY_SHADOW; -#endif - /* Do actual shadow shading. - * First of all, we check if integrator requires transparent shadows. - * if not, we use simplest and fastest ever way to calculate occlusion. 
- * Do not do this in OptiX to avoid the additional trace call. - */ -#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__) - Intersection isect; -# ifdef __TRANSPARENT_SHADOWS__ - if (!kernel_data.integrator.transparent_shadows) -# endif - { - return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow); - } -#endif -#ifdef __TRANSPARENT_SHADOWS__ -# ifdef __SHADOW_RECORD_ALL__ - /* For the transparent shadows we try to use record-all logic on the - * devices which supports this. - */ - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - /* Check transparent bounces here, for volume scatter which can do - * lighting before surface path termination is checked. - */ - if (state->transparent_bounce >= transparent_max_bounce) { - return true; - } - uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; -# if defined(__KERNEL_OPTIX__) - /* Always use record-all behavior in OptiX, but ensure there are no out of bounds - * accesses to the hit stack. - */ - max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1); -# elif defined(__KERNEL_GPU__) - /* On GPU we do tricky with tracing opaque ray first, this avoids speed - * regressions in some files. - * - * TODO(sergey): Check why using record-all behavior causes slowdown in such - * cases. Could that be caused by a higher spill pressure? - */ - const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect); - const bool is_transparent_isect = blocked ? 
shader_transparent_shadow(kg, &isect) : false; - if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) { - return shadow_blocked_transparent_stepped_loop( - kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow); - } -# endif /* __KERNEL_GPU__ */ - return shadow_blocked_transparent_all( - kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow); -# else /* __SHADOW_RECORD_ALL__ */ - /* Fallback to a slowest version which works on all devices. */ - return shadow_blocked_transparent_stepped( - kg, sd, shadow_sd, state, visibility, ray, &isect, shadow); -# endif /* __SHADOW_RECORD_ALL__ */ -#endif /* __TRANSPARENT_SHADOWS__ */ -} - -#undef SHADOW_STACK_MAX_HITS - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h new file mode 100644 index 00000000000..824749818a4 --- /dev/null +++ b/intern/cycles/kernel/kernel_shadow_catcher.h @@ -0,0 +1,116 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/integrator/integrator_state_util.h" +#include "kernel/kernel_path_state.h" + +CCL_NAMESPACE_BEGIN + +/* Check whether current surface bounce is where path is to be split for the shadow catcher. 
*/ +ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS, + const int object_flag) +{ +#ifdef __SHADOW_CATCHER__ + if (!kernel_data.integrator.has_shadow_catcher) { + return false; + } + + /* Check the flag first, avoiding fetches form global memory. */ + if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) { + return false; + } + if (object_flag & SD_OBJECT_HOLDOUT_MASK) { + return false; + } + + const int path_flag = INTEGRATOR_STATE(path, flag); + + if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) { + /* Split only on primary rays, secondary bounces are to treat shadow catcher as a regular + * object. */ + return false; + } + + if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) { + return false; + } + + return true; +#else + (void)object_flag; + return false; +#endif +} + +/* Check whether the current path can still split. */ +ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS) +{ + if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) { + return false; + } + + const int path_flag = INTEGRATOR_STATE(path, flag); + + if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) { + /* Shadow catcher was already hit and the state was split. No further split is allowed. */ + return false; + } + + return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0; +} + +/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths + * after this function. */ +ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags) +{ +#ifdef __SHADOW_CATCHER__ + + if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) { + return false; + } + + /* The split is to be done. Mark the current state as such, so that it stops contributing to the + * shadow catcher matte pass, but keeps contributing to the combined pass. 
*/ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT; + + /* Split new state from the current one. This new state will only track contribution of shadow + * catcher objects ignoring non-catcher objects. */ + integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS); + + return true; +#else + (void)object_flags; + return false; +#endif +} + +#ifdef __SHADOW_CATCHER__ + +ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS) +{ + return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0; +} + +ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS) +{ + return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS; +} + +#endif /* __SHADOW_CATCHER__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h deleted file mode 100644 index 677504a4045..00000000000 --- a/intern/cycles/kernel/kernel_subsurface.h +++ /dev/null @@ -1,724 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* BSSRDF using disk based importance sampling. 
- * - * BSSRDF Importance Sampling, SIGGRAPH 2013 - * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf - */ - -ccl_device_inline float3 -subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all) -{ - /* This is the Veach one-sample model with balance heuristic, some pdf - * factors drop out when using balance heuristic weighting. For branched - * path tracing (all) we sample all closure and don't use MIS. */ - float3 eval_sum = zero_float3(); - float pdf_sum = 0.0f; - float sample_weight_inv = 0.0f; - - if (!all) { - float sample_weight_sum = 0.0f; - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - sample_weight_sum += sc->sample_weight; - } - } - - sample_weight_inv = 1.0f / sample_weight_sum; - } - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - /* in case of branched path integrate we sample all bssrdf's once, - * for path trace we pick one, so adjust pdf for that */ - float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv; - - /* compute pdf */ - float3 eval = bssrdf_eval(sc, r); - float pdf = bssrdf_pdf(sc, disk_r); - - eval_sum += sc->weight * eval; - pdf_sum += sample_weight * pdf; - } - } - - return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3(); -} - -ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd, - const ShaderClosure *sc, - float3 throughput, - bool all) -{ - /* This is the Veach one-sample model with balance heuristic, some pdf - * factors drop out when using balance heuristic weighting. For branched - * path tracing (all) we sample all closure and don't use MIS. 
*/ - if (!all) { - float bssrdf_weight = 0.0f; - float weight = sc->sample_weight; - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_BSSRDF(sc->type)) { - bssrdf_weight += sc->sample_weight; - } - } - throughput *= bssrdf_weight / weight; - } - return throughput; -} - -/* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf( - KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N) -{ - sd->flag &= ~SD_CLOSURE_FLAGS; - sd->num_closure = 0; - sd->num_closure_left = kernel_data.integrator.max_closures; - -#ifdef __PRINCIPLED__ - if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { - PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc( - sd, sizeof(PrincipledDiffuseBsdf), weight); - - if (bsdf) { - bsdf->N = N; - bsdf->roughness = roughness; - sd->flag |= bsdf_principled_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular Disney principled diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; - } - } - else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type)) -#endif /* __PRINCIPLED__ */ - { - DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if (bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; - } - } -} - -/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */ -ccl_device float3 subsurface_color_pow(float3 color, float exponent) -{ - color = max(color, zero_float3()); - - if (exponent == 1.0f) { - /* nothing to do */ - } - else if (exponent == 0.5f) { - color.x = 
sqrtf(color.x); - color.y = sqrtf(color.y); - color.z = sqrtf(color.z); - } - else { - color.x = powf(color.x, exponent); - color.y = powf(color.y, exponent); - color.z = powf(color.z, exponent); - } - - return color; -} - -ccl_device void subsurface_color_bump_blur( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N) -{ - /* average color and texture blur at outgoing point */ - float texture_blur; - float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur); - - /* do we have bump mapping? */ - bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0; - - if (bump || texture_blur > 0.0f) { - /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, state, NULL, state->flag); - float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL); - - /* we simply divide out the average color and multiply with the average - * of the other one. we could try to do this per closure but it's quite - * tricky to match closures between shader evaluations, their number and - * order may change, this is simpler */ - if (texture_blur > 0.0f) { - out_color = subsurface_color_pow(out_color, texture_blur); - in_color = subsurface_color_pow(in_color, texture_blur); - - *eval *= safe_divide_color(in_color, out_color); - } - } -} - -/* Subsurface scattering step, from a point on the surface to other - * nearby points on the same object. 
- */ -ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - const ShaderClosure *sc, - uint *lcg_state, - float disk_u, - float disk_v, - bool all) -{ - /* pick random axis in local frame and point on disk */ - float3 disk_N, disk_T, disk_B; - float pick_pdf_N, pick_pdf_T, pick_pdf_B; - - disk_N = sd->Ng; - make_orthonormals(disk_N, &disk_T, &disk_B); - - if (disk_v < 0.5f) { - pick_pdf_N = 0.5f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.25f; - disk_v *= 2.0f; - } - else if (disk_v < 0.75f) { - float3 tmp = disk_N; - disk_N = disk_T; - disk_T = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.5f; - pick_pdf_B = 0.25f; - disk_v = (disk_v - 0.5f) * 4.0f; - } - else { - float3 tmp = disk_N; - disk_N = disk_B; - disk_B = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.5f; - disk_v = (disk_v - 0.75f) * 4.0f; - } - - /* sample point on disk */ - float phi = M_2PI_F * disk_v; - float disk_height, disk_r; - - bssrdf_sample(sc, disk_u, &disk_r, &disk_height); - - float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B; - - /* create ray */ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - ray->P = sd->P + disk_N * disk_height + disk_P; - ray->D = -disk_N; - ray->t = 2.0f * disk_height; - ray->dP = sd->dP; - ray->dD = differential3_zero(); - ray->time = sd->time; - - /* intersect with the same object. if multiple intersections are found it - * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ - scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS); - int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS); - - for (int hit = 0; hit < num_eval_hits; hit++) { - /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ - float3 hit_P; - if (sd->type & PRIMITIVE_TRIANGLE) { - hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray); - } -#ifdef __OBJECT_MOTION__ - else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) { - float3 verts[3]; - motion_triangle_vertices(kg, - sd->object, - kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - sd->time, - verts); - hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts); - } -#endif /* __OBJECT_MOTION__ */ - else { - ss_isect->weight[hit] = zero_float3(); - continue; - } - - float3 hit_Ng = ss_isect->Ng[hit]; - if (ss_isect->hits[hit].object != OBJECT_NONE) { - object_normal_transform(kg, sd, &hit_Ng); - } - - /* Probability densities for local frame axes. */ - float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng)); - float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng)); - float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng)); - - /* Multiple importance sample between 3 axes, power heuristic - * found to be slightly better than balance heuristic. pdf_N - * in the MIS weight and denominator cancelled out. */ - float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B)); - if (ss_isect->num_hits > BSSRDF_MAX_HITS) { - w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS; - } - - /* Real distance to sampled point. */ - float r = len(hit_P - sd->P); - - /* Evaluate profiles. 
*/ - float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w; - - ss_isect->weight[hit] = eval; - } - -#ifdef __SPLIT_KERNEL__ - ss_isect->ray = *ray; -#endif - - return num_eval_hits; -} - -#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg, - LocalIntersection *ss_isect, - int hit, - ShaderData *sd, - ccl_addr_space PathState *state, - ClosureType type, - float roughness) -{ - optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness); -} -extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup( -#else -ccl_device_noinline void subsurface_scatter_multi_setup( -#endif - KernelGlobals *kg, - LocalIntersection *ss_isect, - int hit, - ShaderData *sd, - ccl_addr_space PathState *state, - ClosureType type, - float roughness) -{ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - - /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ -#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) - kernel_split_params.dummy_sd_flag = sd->flag; -#endif - - /* Setup new shading point. */ - shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); - - /* Optionally blur colors and bump mapping. */ - float3 weight = ss_isect->weight[hit]; - float3 N = sd->N; - subsurface_color_bump_blur(kg, sd, state, &weight, &N); - - /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N); -} - -/* Random walk subsurface scattering. - * - * "Practical and Controllable Subsurface Scattering for Production Path - * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ - -ccl_device void subsurface_random_walk_remap(const float A, - const float d, - float *sigma_t, - float *alpha) -{ - /* Compute attenuation and scattering coefficients from albedo. 
*/ - *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f))); - const float s = 1.9f - A + 3.5f * sqr(A - 0.8f); - - *sigma_t = 1.0f / fmaxf(d * s, 1e-16f); -} - -ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc, - float3 *sigma_t, - float3 *alpha, - float3 *weight) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - const float3 A = bssrdf->albedo; - const float3 d = bssrdf->radius; - float sigma_t_x, sigma_t_y, sigma_t_z; - float alpha_x, alpha_y, alpha_z; - - subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x); - subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y); - subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z); - - *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z); - *alpha = make_float3(alpha_x, alpha_y, alpha_z); - - /* Closure mixing and Fresnel weights separate from albedo. */ - *weight = safe_divide_color(bssrdf->weight, A); -} - -/* References for Dwivedi sampling: - * - * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering" - * by Jaroslav KÅ™ivánek and Eugene d'Eon (SIGGRAPH 2014) - * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/ - * - * [2] "Improving the Dwivedi Sampling Scheme" - * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016) - * https://cg.ivd.kit.edu/1951.php - * - * [3] "Zero-Variance Theory for Efficient Subsurface Scattering" - * by Eugene d'Eon and Jaroslav KÅ™ivánek (SIGGRAPH 2020) - * https://iliyan.com/publications/RenderingCourse2020 - */ - -ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta) -{ - /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */ - return 1.0f / ((v - cos_theta) * phase_log); -} - -ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand) -{ - /* Based on Eq. 
10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)` - * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation, - * we can implement the power function like this. */ - return v - (v + 1) * expf(-rand * phase_log); -} - -ccl_device_forceinline float diffusion_length_dwivedi(float alpha) -{ - /* Eq. 67 from [3] */ - return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha)); -} - -ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv) -{ - float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta); - float phi = M_2PI_F * randv; - float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta); - - float3 T, B; - make_orthonormals(D, &T, &B); - return dir.x * T + dir.y * B + dir.z * D; -} - -ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t, - float t, - bool hit, - float3 *transmittance) -{ - float3 T = volume_color_transmittance(sigma_t, t); - if (transmittance) { - *transmittance = T; - } - return hit ? T : sigma_t * T; -} - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline /* inline trace calls */ -#else -ccl_device_noinline -#endif - bool - subsurface_random_walk(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - ccl_addr_space PathState *state, - const ShaderClosure *sc, - const float bssrdf_u, - const float bssrdf_v, - bool all) -{ - /* Sample diffuse surface scatter into the object. */ - float3 D; - float pdf; - sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf); - if (dot(-sd->Ng, D) <= 0.0f) { - return 0; - } - - /* Convert subsurface to volume coefficients. - * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. 
*/ - float3 sigma_t, alpha; - float3 throughput = one_float3(); - subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput); - float3 sigma_s = sigma_t * alpha; - - /* Theoretically it should be better to use the exact alpha for the channel we're sampling at - * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange - * for making the code significantly more complex and slower (if direction sampling depends on - * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on). - * - * Since the strength of the guided sampling increases as alpha gets lower, using a value that - * is too low results in fireflies while one that's too high just gives a bit more noise. - * Therefore, the code here uses the highest of the three albedos to be safe. */ - float diffusion_length = diffusion_length_dwivedi(max3(alpha)); - /* Precompute term for phase sampling. */ - float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1)); - - /* Setup ray. */ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - ray->P = ray_offset(sd->P, -sd->Ng); - ray->D = D; - ray->t = FLT_MAX; - ray->time = sd->time; - - /* Modify state for RNGs, decorrelated from other paths. */ - uint prev_rng_offset = state->rng_offset; - uint prev_rng_hash = state->rng_hash; - state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef); - - /* Random walk until we hit the surface again. */ - bool hit = false; - bool have_opposite_interface = false; - float opposite_distance = 0.0f; - - /* Todo: Disable for alpha>0.999 or so? */ - const float guided_fraction = 0.75f; - - for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) { - /* Advance random number offset. */ - state->rng_offset += PRNG_BOUNCE_NUM; - - /* Sample color channel, use MIS with balance heuristic. 
*/ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf); - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - /* We need the result of the raycast to compute the full guided PDF, so just remember the - * relevant terms to avoid recomputing them later. */ - float backward_fraction = 0.0f; - float forward_pdf_factor = 0.0f; - float forward_stretching = 1.0f; - float backward_pdf_factor = 0.0f; - float backward_stretching = 1.0f; - - /* For the initial ray, we already know the direction, so just do classic distance sampling. */ - if (bounce > 0) { - /* Decide whether we should use guided or classic sampling. */ - bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction); - - /* Determine if we want to sample away from the incoming interface. - * This only happens if we found a nearby opposite interface, and the probability for it - * depends on how close we are to it already. - * This probability term comes from the recorded presentation of [3]. */ - bool guide_backward = false; - if (have_opposite_interface) { - /* Compute distance of the random walk between the tangent plane at the starting point - * and the assumed opposite interface (the parallel plane that contains the point we - * found in our ray query for the opposite side). */ - float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance); - backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length)); - guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction; - } - - /* Sample scattering direction. 
*/ - float scatter_u, scatter_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v); - float cos_theta; - if (guided) { - cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u); - /* The backwards guiding distribution is just mirrored along sd->N, so swapping the - * sign here is enough to sample from that instead. */ - if (guide_backward) { - cos_theta = -cos_theta; - } - } - else { - cos_theta = 2.0f * scatter_u - 1.0f; - } - ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v); - - /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic). - * Since phase sampling is channel-independent, we can get away with applying a factor - * to the guided PDF, which implicitly means pulling out the classic PDF term and letting - * it cancel with an equivalent term in the numerator of the full estimator. - * For the backward PDF, we again reuse the same probability distribution with a sign swap. - */ - forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta); - backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta); - - /* Prepare distance sampling. - * For the backwards case, this also needs the sign swapped since now directions against - * sd->N (and therefore with negative cos_theta) are preferred. */ - forward_stretching = (1.0f - cos_theta / diffusion_length); - backward_stretching = (1.0f + cos_theta / diffusion_length); - if (guided) { - sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching; - } - } - - /* Sample direction along ray. */ - float t = -logf(1.0f - randt) / sample_sigma_t; - - /* On the first bounce, we use the raycast to check if the opposite side is nearby. - * If yes, we will later use backwards guided sampling in order to have a decent - * chance of connecting to it. - * Todo: Maybe use less than 10 times the mean free path? */ - ray->t = (bounce == 0) ? 
max(t, 10.0f / (min3(sigma_t))) : t; - scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1); - hit = (ss_isect->num_hits > 0); - - if (hit) { -#ifdef __KERNEL_OPTIX__ - /* t is always in world space with OptiX. */ - ray->t = ss_isect->hits[0].t; -#else - /* Compute world space distance to surface hit. */ - float3 D = ray->D; - object_inverse_dir_transform(kg, sd, &D); - D = normalize(D) * ss_isect->hits[0].t; - object_dir_transform(kg, sd, &D); - ray->t = len(D); -#endif - } - - if (bounce == 0) { - /* Check if we hit the opposite side. */ - if (hit) { - have_opposite_interface = true; - opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N); - } - /* Apart from the opposite side check, we were supposed to only trace up to distance t, - * so check if there would have been a hit in that case. */ - hit = ray->t < t; - } - - /* Use the distance to the exit point for the throughput update if we found one. */ - if (hit) { - t = ray->t; - } - else if (bounce == 0) { - /* Restore original position if nothing was hit after the first bounce, - * without the ray_offset() that was added to avoid self-intersection. - * Otherwise if that offset is relatively large compared to the scattering - * radius, we never go back up high enough to exit the surface. */ - ray->P = sd->P; - } - - /* Advance to new scatter location. */ - ray->P += t * ray->D; - - float3 transmittance; - float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance); - if (bounce > 0) { - /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */ - float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL); - - if (have_opposite_interface) { - /* First step of MIS: Depending on geometry we might have two methods for guided - * sampling, so perform MIS between them. 
*/ - float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL); - guided_pdf = mix( - guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction); - } - else { - /* Just include phase sampling factor otherwise. */ - guided_pdf *= forward_pdf_factor; - } - - /* Now we apply the MIS balance heuristic between the classic and guided sampling. */ - pdf = mix(pdf, guided_pdf, guided_fraction); - } - - /* Finally, we're applying MIS again to combine the three color channels. - * Altogether, the MIS computation combines up to nine different estimators: - * {classic, guided, backward_guided} x {r, g, b} */ - throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf); - - if (hit) { - /* If we hit the surface, we are done. */ - break; - } - else if (throughput.x < VOLUME_THROUGHPUT_EPSILON && - throughput.y < VOLUME_THROUGHPUT_EPSILON && - throughput.z < VOLUME_THROUGHPUT_EPSILON) { - /* Avoid unnecessary work and precision issue when throughput gets really small. */ - break; - } - } - - kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) && - isfinite_safe(throughput.z)); - - state->rng_offset = prev_rng_offset; - state->rng_hash = prev_rng_hash; - - /* Return number of hits in ss_isect. */ - if (!hit) { - return 0; - } - - /* TODO: gain back performance lost from merging with disk BSSRDF. We - * only need to return on hit so this indirect ray push/pop overhead - * is not actually needed, but it does keep the code simpler. 
*/ - ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all); -#ifdef __SPLIT_KERNEL__ - ss_isect->ray = *ray; -#endif - - return 1; -} - -ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - ccl_addr_space PathState *state, - const ShaderClosure *sc, - uint *lcg_state, - float bssrdf_u, - float bssrdf_v, - bool all) -{ - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all); - } - else { - return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index c8e01677d09..bf9b94c1753 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders) KERNEL_TEX(float, __lookup_table) /* sobol */ -KERNEL_TEX(uint, __sample_pattern_lut) +KERNEL_TEX(float, __sample_pattern_lut) /* image textures */ KERNEL_TEX(TextureInfo, __texture_info) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 7cbe18acf28..927e60e8729 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_TYPES_H__ -#define __KERNEL_TYPES_H__ +#pragma once #if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE) # include <embree3/rtcore.h> @@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN #define PRIM_NONE (~0) #define LAMP_NONE (~0) #define ID_NONE (0.0f) +#define PASS_UNUSED (~0) -#define VOLUME_STACK_SIZE 32 - -/* Split kernel constants */ -#define WORK_POOL_SIZE_GPU 64 -#define WORK_POOL_SIZE_CPU 1 -#ifdef __KERNEL_GPU__ -# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU -#else -# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU -#endif - -#define SHADER_SORT_BLOCK_SIZE 2048 - -#ifdef __KERNEL_OPENCL__ -# define SHADER_SORT_LOCAL_SIZE 64 -#elif defined(__KERNEL_CUDA__) -# define SHADER_SORT_LOCAL_SIZE 32 -#else -# define SHADER_SORT_LOCAL_SIZE 1 -#endif +#define VOLUME_STACK_SIZE 4 /* Kernel features */ #define __SOBOL__ @@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN #define __INTERSECTION_REFINE__ #define __CLAMP_SAMPLE__ #define __PATCH_EVAL__ -#define __SHADOW_TRICKS__ +#define __SHADOW_CATCHER__ #define __DENOISING_FEATURES__ #define __SHADER_RAYTRACE__ #define __AO__ @@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN #define __SVM__ #define __EMISSION__ #define __HOLDOUT__ -#define __MULTI_CLOSURE__ #define __TRANSPARENT_SHADOWS__ #define __BACKGROUND_MIS__ #define __LAMP_MIS__ @@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN #define __PRINCIPLED__ #define __SUBSURFACE__ #define __VOLUME__ -#define __VOLUME_SCATTER__ #define __CMJ__ #define __SHADOW_RECORD_ALL__ #define __BRANCHED_PATH__ @@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN # ifdef WITH_OSL # define __OSL__ # endif -# define __VOLUME_DECOUPLED__ # define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ -#ifdef __KERNEL_CUDA__ -# ifdef __SPLIT_KERNEL__ -# undef __BRANCHED_PATH__ -# endif -#endif /* __KERNEL_CUDA__ */ - #ifdef __KERNEL_OPTIX__ # undef __BAKING__ -# undef __BRANCHED_PATH__ #endif /* __KERNEL_OPTIX__ */ -#ifdef __KERNEL_OPENCL__ -#endif /* __KERNEL_OPENCL__ */ - /* Scene-based selective features compilation. 
*/ -#ifdef __NO_CAMERA_MOTION__ -# undef __CAMERA_MOTION__ -#endif -#ifdef __NO_OBJECT_MOTION__ -# undef __OBJECT_MOTION__ -#endif -#ifdef __NO_HAIR__ -# undef __HAIR__ -#endif -#ifdef __NO_VOLUME__ -# undef __VOLUME__ -# undef __VOLUME_SCATTER__ -#endif -#ifdef __NO_SUBSURFACE__ -# undef __SUBSURFACE__ -#endif -#ifdef __NO_BAKING__ -# undef __BAKING__ -#endif -#ifdef __NO_BRANCHED_PATH__ -# undef __BRANCHED_PATH__ -#endif -#ifdef __NO_PATCH_EVAL__ -# undef __PATCH_EVAL__ -#endif -#ifdef __NO_TRANSPARENT__ -# undef __TRANSPARENT_SHADOWS__ -#endif -#ifdef __NO_SHADOW_TRICKS__ -# undef __SHADOW_TRICKS__ -#endif -#ifdef __NO_PRINCIPLED__ -# undef __PRINCIPLED__ -#endif -#ifdef __NO_DENOISING__ -# undef __DENOISING_FEATURES__ -#endif -#ifdef __NO_SHADER_RAYTRACE__ -# undef __SHADER_RAYTRACE__ +#ifdef __KERNEL_FEATURES__ +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION) +# undef __CAMERA_MOTION__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION) +# undef __OBJECT_MOTION__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR) +# undef __HAIR__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME) +# undef __VOLUME__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE) +# undef __SUBSURFACE__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING) +# undef __BAKING__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION) +# undef __PATCH_EVAL__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT) +# undef __TRANSPARENT_SHADOWS__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER) +# undef __SHADOW_CATCHER__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED) +# undef __PRINCIPLED__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING) +# undef __DENOISING_FEATURES__ +# endif #endif #ifdef WITH_CYCLES_DEBUG_NAN # define __KERNEL_DEBUG_NAN__ #endif +/* Features that enable others */ + #if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__) # define 
__BVH_LOCAL__ #endif -/* Shader Evaluation */ - -typedef enum ShaderEvalType { - SHADER_EVAL_DISPLACE, - SHADER_EVAL_BACKGROUND, - /* bake types */ - SHADER_EVAL_BAKE, /* no real shade, it's used in the code to - * differentiate the type of shader eval from the above - */ - /* data passes */ - SHADER_EVAL_NORMAL, - SHADER_EVAL_UV, - SHADER_EVAL_ROUGHNESS, - SHADER_EVAL_DIFFUSE_COLOR, - SHADER_EVAL_GLOSSY_COLOR, - SHADER_EVAL_TRANSMISSION_COLOR, - SHADER_EVAL_EMISSION, - SHADER_EVAL_AOV_COLOR, - SHADER_EVAL_AOV_VALUE, - - /* light passes */ - SHADER_EVAL_AO, - SHADER_EVAL_COMBINED, - SHADER_EVAL_SHADOW, - SHADER_EVAL_DIFFUSE, - SHADER_EVAL_GLOSSY, - SHADER_EVAL_TRANSMISSION, - - /* extra */ - SHADER_EVAL_ENVIRONMENT, -} ShaderEvalType; - /* Path Tracing * note we need to keep the u/v pairs at even values */ @@ -252,8 +185,7 @@ enum PathTraceDimension { enum SamplingPattern { SAMPLING_PATTERN_SOBOL = 0, - SAMPLING_PATTERN_CMJ = 1, - SAMPLING_PATTERN_PMJ = 2, + SAMPLING_PATTERN_PMJ = 1, SAMPLING_NUM_PATTERNS, }; @@ -261,7 +193,12 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - /* Ray visibility. */ + /* -------------------------------------------------------------------- + * Ray visibility. + * + * NOTE: Recalculated after a surface bounce. + */ + PATH_RAY_CAMERA = (1 << 0), PATH_RAY_REFLECT = (1 << 1), PATH_RAY_TRANSMIT = (1 << 2), @@ -269,57 +206,106 @@ enum PathRayFlag { PATH_RAY_GLOSSY = (1 << 4), PATH_RAY_SINGULAR = (1 << 5), PATH_RAY_TRANSPARENT = (1 << 6), + PATH_RAY_VOLUME_SCATTER = (1 << 7), /* Shadow ray visibility. 
*/ - PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), - PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), - PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER), - PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), - PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), - PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER | - PATH_RAY_SHADOW_TRANSPARENT_CATCHER), - PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | - PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW_OPAQUE = (1 << 8), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 9), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT), - /* Unused, free to reuse. */ - PATH_RAY_UNUSED = (1 << 11), + /* Special flag to tag unaligned BVH nodes. + * Only set and used in BVH nodes to distinguish how to interpret bounding box information stored + * in the node (either it should be intersected as AABB or as OBB). */ + PATH_RAY_NODE_UNALIGNED = (1 << 10), - /* Ray visibility for volume scattering. */ - PATH_RAY_VOLUME_SCATTER = (1 << 12), - - /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = (1 << 13), + /* Subset of flags used for ray visibility for intersection. + * + * NOTE: SHADOW_CATCHER macros below assume there are no more than + * 16 visibility bits. */ + PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1), - PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1), + /* -------------------------------------------------------------------- + * Path flags. + */ /* Don't apply multiple importance sampling weights to emission from * lamp or surface hits, because they were not direct light sampled. */ - PATH_RAY_MIS_SKIP = (1 << 14), + PATH_RAY_MIS_SKIP = (1 << 11), + /* Diffuse bounce earlier in the path, skip SSS to improve performance * and avoid branching twice with disk sampling SSS. */ - PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12), + /* Single pass has been written. 
*/ - PATH_RAY_SINGLE_PASS_DONE = (1 << 16), - /* Ray is behind a shadow catcher. */ - PATH_RAY_SHADOW_CATCHER = (1 << 17), - /* Store shadow data for shadow catcher or denoising. */ - PATH_RAY_STORE_SHADOW_INFO = (1 << 18), + PATH_RAY_SINGLE_PASS_DONE = (1 << 13), + /* Zero background alpha, for camera or transparent glass rays. */ - PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19), + PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14), + /* Terminate ray immediately at next bounce. */ - PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20), + PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15), + PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16), + /* Ray is to be terminated, but continue with transparent bounces and * emission as long as we encounter them. This is required to make the * MIS between direct and indirect light rays match, as shadow rays go * through transparent surfaces to reach emission too. */ - PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21), + PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17), + + /* Terminate ray immediately after volume shading. */ + PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18), + /* Ray is to be terminated. */ - PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT), + PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME | + PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME), + /* Path and shader is being evaluated for direct lighting emission. */ - PATH_RAY_EMISSION = (1 << 22) + PATH_RAY_EMISSION = (1 << 19), + + /* Perform subsurface scattering. */ + PATH_RAY_SUBSURFACE = (1 << 20), + + /* Contribute to denoising features. */ + PATH_RAY_DENOISING_FEATURES = (1 << 21), + + /* Render pass categories. */ + PATH_RAY_REFLECT_PASS = (1 << 22), + PATH_RAY_TRANSMISSION_PASS = (1 << 23), + PATH_RAY_VOLUME_PASS = (1 << 24), + PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS), + + /* Shadow ray is for a light or surface. 
*/ + PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25), + + /* A shadow catcher object was hit and the path was split into two. */ + PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26), + + /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into + * their dedicated pass for later division. + * + * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling + * which is separate from the light passes. */ + PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27), + + /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */ + PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28), }; +/* Configure ray visibility bits for rays and objects respectively, + * to make shadow catchers work. + * + * On shadow catcher paths we want to ignore any intersections with non-catchers, + * whereas on regular paths we want to intersect all objects. */ + +#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16) + +#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \ + (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \ + (visibility)) + +#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \ + (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility)) + /* Closure Label */ typedef enum ClosureLabel { @@ -332,6 +318,7 @@ typedef enum ClosureLabel { LABEL_TRANSPARENT = 32, LABEL_VOLUME_SCATTER = 64, LABEL_TRANSMIT_TRANSPARENT = 128, + LABEL_SUBSURFACE_SCATTER = 256, } ClosureLabel; /* Render Passes */ @@ -339,17 +326,35 @@ typedef enum ClosureLabel { #define PASS_NAME_JOIN(a, b) a##_##b #define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32)) -#define PASSMASK_COMPONENT(comp) \ - (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \ - PASSMASK(PASS_NAME_JOIN(comp, COLOR))) - +// NOTE: Keep in sync with `Pass::get_type_enum()`. 
typedef enum PassType { PASS_NONE = 0, - /* Main passes */ + /* Light Passes */ PASS_COMBINED = 1, - PASS_DEPTH, + PASS_EMISSION, + PASS_BACKGROUND, + PASS_AO, + PASS_SHADOW, + PASS_DIFFUSE, + PASS_DIFFUSE_DIRECT, + PASS_DIFFUSE_INDIRECT, + PASS_GLOSSY, + PASS_GLOSSY_DIRECT, + PASS_GLOSSY_INDIRECT, + PASS_TRANSMISSION, + PASS_TRANSMISSION_DIRECT, + PASS_TRANSMISSION_INDIRECT, + PASS_VOLUME, + PASS_VOLUME_DIRECT, + PASS_VOLUME_INDIRECT, + PASS_CATEGORY_LIGHT_END = 31, + + /* Data passes */ + PASS_DEPTH = 32, + PASS_POSITION, PASS_NORMAL, + PASS_ROUGHNESS, PASS_UV, PASS_OBJECT_ID, PASS_MATERIAL_ID, @@ -361,31 +366,35 @@ typedef enum PassType { PASS_AOV_VALUE, PASS_ADAPTIVE_AUX_BUFFER, PASS_SAMPLE_COUNT, - PASS_CATEGORY_MAIN_END = 31, - - PASS_MIST = 32, - PASS_EMISSION, - PASS_BACKGROUND, - PASS_AO, - PASS_SHADOW, - PASS_LIGHT, /* no real pass, used to force use_light_pass */ - PASS_DIFFUSE_DIRECT, - PASS_DIFFUSE_INDIRECT, PASS_DIFFUSE_COLOR, - PASS_GLOSSY_DIRECT, - PASS_GLOSSY_INDIRECT, PASS_GLOSSY_COLOR, - PASS_TRANSMISSION_DIRECT, - PASS_TRANSMISSION_INDIRECT, PASS_TRANSMISSION_COLOR, - PASS_VOLUME_DIRECT = 50, - PASS_VOLUME_INDIRECT, /* No Scatter color since it's tricky to define what it would even mean. */ - PASS_CATEGORY_LIGHT_END = 63, + PASS_MIST, + PASS_DENOISING_NORMAL, + PASS_DENOISING_ALBEDO, + + /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by + * any other object. The pass accessor will divide the combined pass by the shadow catcher. The + * result of this division is then to be multiplied with the backdrop. The alpha channel of this + * pass contains number of samples which contributed to the color components of the pass. + * + * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains number of samples for which the path split + * happenned. + * + * PASS_SHADOW_CATCHER_MATTE contains pass which contains non-catcher objects. This pass is to be + * alpha-overed onto the backdrop (after multiplication). 
*/ + PASS_SHADOW_CATCHER, + PASS_SHADOW_CATCHER_SAMPLE_COUNT, + PASS_SHADOW_CATCHER_MATTE, + + PASS_CATEGORY_DATA_END = 63, PASS_BAKE_PRIMITIVE, PASS_BAKE_DIFFERENTIAL, - PASS_CATEGORY_BAKE_END = 95 + PASS_CATEGORY_BAKE_END = 95, + + PASS_NUM, } PassType; #define PASS_ANY (~0) @@ -398,158 +407,9 @@ typedef enum CryptomatteType { CRYPT_ACCURATE = (1 << 3), } CryptomatteType; -typedef enum DenoisingPassOffsets { - DENOISING_PASS_NORMAL = 0, - DENOISING_PASS_NORMAL_VAR = 3, - DENOISING_PASS_ALBEDO = 6, - DENOISING_PASS_ALBEDO_VAR = 9, - DENOISING_PASS_DEPTH = 12, - DENOISING_PASS_DEPTH_VAR = 13, - DENOISING_PASS_SHADOW_A = 14, - DENOISING_PASS_SHADOW_B = 17, - DENOISING_PASS_COLOR = 20, - DENOISING_PASS_COLOR_VAR = 23, - DENOISING_PASS_CLEAN = 26, - - DENOISING_PASS_PREFILTERED_DEPTH = 0, - DENOISING_PASS_PREFILTERED_NORMAL = 1, - DENOISING_PASS_PREFILTERED_SHADOWING = 4, - DENOISING_PASS_PREFILTERED_ALBEDO = 5, - DENOISING_PASS_PREFILTERED_COLOR = 8, - DENOISING_PASS_PREFILTERED_VARIANCE = 11, - DENOISING_PASS_PREFILTERED_INTENSITY = 14, - - DENOISING_PASS_SIZE_BASE = 26, - DENOISING_PASS_SIZE_CLEAN = 3, - DENOISING_PASS_SIZE_PREFILTERED = 15, -} DenoisingPassOffsets; - -typedef enum eBakePassFilter { - BAKE_FILTER_NONE = 0, - BAKE_FILTER_DIRECT = (1 << 0), - BAKE_FILTER_INDIRECT = (1 << 1), - BAKE_FILTER_COLOR = (1 << 2), - BAKE_FILTER_DIFFUSE = (1 << 3), - BAKE_FILTER_GLOSSY = (1 << 4), - BAKE_FILTER_TRANSMISSION = (1 << 5), - BAKE_FILTER_EMISSION = (1 << 6), - BAKE_FILTER_AO = (1 << 7), -} eBakePassFilter; - -typedef enum BakePassFilterCombos { - BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE | - BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION | - BAKE_FILTER_AO), - BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE), - BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY), - BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION), - 
BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE), - BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY), - BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION), -} BakePassFilterCombos; - -typedef enum DenoiseFlag { - DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), - DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), - DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), - DENOISING_CLEAN_GLOSSY_IND = (1 << 3), - DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), - DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), - DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1, -} DenoiseFlag; - -typedef ccl_addr_space struct PathRadianceState { -#ifdef __PASSES__ - float3 diffuse; - float3 glossy; - float3 transmission; - float3 volume; - - float3 direct; -#endif -} PathRadianceState; - -typedef ccl_addr_space struct PathRadiance { -#ifdef __PASSES__ - int use_light_pass; -#endif - - float transparent; - float3 emission; -#ifdef __PASSES__ - float3 background; - float3 ao; - - float3 indirect; - float3 direct_emission; - - float3 color_diffuse; - float3 color_glossy; - float3 color_transmission; - - float3 direct_diffuse; - float3 direct_glossy; - float3 direct_transmission; - float3 direct_volume; - - float3 indirect_diffuse; - float3 indirect_glossy; - float3 indirect_transmission; - float3 indirect_volume; - - float3 shadow; - float mist; -#endif - - struct PathRadianceState state; - -#ifdef __SHADOW_TRICKS__ - /* Total light reachable across the path, ignoring shadow blocked queries. */ - float3 path_total; - /* Total light reachable across the path with shadow blocked queries - * applied here. - * - * Dividing this figure by path_total will give estimate of shadow pass. - */ - float3 path_total_shaded; - - /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_background_color; - - /* Path radiance sum and throughput at the moment when ray hits shadow - * catcher object. 
- */ - float shadow_throughput; - - /* Accumulated transparency along the path after shadow catcher bounce. */ - float shadow_transparency; - - /* Indicate if any shadow catcher data is set. */ - int has_shadow_catcher; -#endif - -#ifdef __DENOISING_FEATURES__ - float3 denoising_normal; - float3 denoising_albedo; - float denoising_depth; -#endif /* __DENOISING_FEATURES__ */ -} PathRadiance; - typedef struct BsdfEval { -#ifdef __PASSES__ - int use_light_pass; -#endif - float3 diffuse; -#ifdef __PASSES__ float3 glossy; - float3 transmission; - float3 transparent; - float3 volume; -#endif -#ifdef __SHADOW_TRICKS__ - float3 sum_no_mis; -#endif } BsdfEval; /* Shader Flag */ @@ -564,8 +424,10 @@ typedef enum ShaderFlag { SHADER_EXCLUDE_TRANSMIT = (1 << 25), SHADER_EXCLUDE_CAMERA = (1 << 24), SHADER_EXCLUDE_SCATTER = (1 << 23), + SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22), SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT | - SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER), + SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER | + SHADER_EXCLUDE_SHADOW_CATCHER), SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS | SHADER_EXCLUDE_ANY) @@ -612,29 +474,14 @@ typedef struct differential { /* Ray */ typedef struct Ray { -/* TODO(sergey): This is only needed because current AMD - * compiler has hard time building the kernel with this - * reshuffle. And at the same time reshuffle will cause - * less optimal CPU code in certain places. - * - * We'll get rid of this nasty exception once AMD compiler - * is fixed. 
- */ -#ifndef __KERNEL_OPENCL_AMD__ float3 P; /* origin */ float3 D; /* direction */ float t; /* length of the ray */ float time; /* time (for motion blur) */ -#else - float t; /* length of the ray */ - float time; /* time (for motion blur) */ - float3 P; /* origin */ - float3 D; /* direction */ -#endif #ifdef __RAY_DIFFERENTIALS__ - differential3 dP; - differential3 dD; + float dP; + float dD; #endif } Ray; @@ -661,9 +508,6 @@ typedef enum PrimitiveType { PRIMITIVE_CURVE_RIBBON = (1 << 4), PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5), PRIMITIVE_VOLUME = (1 << 6), - /* Lamp primitive is not included below on purpose, - * since it is no real traceable primitive. - */ PRIMITIVE_LAMP = (1 << 7), PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE), @@ -672,16 +516,14 @@ typedef enum PrimitiveType { PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME), PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK | PRIMITIVE_MOTION_CURVE_RIBBON), - PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME), + PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME | + PRIMITIVE_LAMP), - /* Total number of different traceable primitives. - * NOTE: This is an actual value, not a bitflag. 
- */ - PRIMITIVE_NUM_TOTAL = 7, + PRIMITIVE_NUM = 8, } PrimitiveType; -#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type)) -#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL) +#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type)) +#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM) typedef enum CurveShapeType { CURVE_RIBBON = 0, @@ -760,20 +602,14 @@ typedef struct AttributeDescriptor { /* Closure data */ -#ifdef __MULTI_CLOSURE__ -# ifdef __SPLIT_KERNEL__ -# define MAX_CLOSURE 1 -# else -# ifndef __MAX_CLOSURE__ -# define MAX_CLOSURE 64 -# else -# define MAX_CLOSURE __MAX_CLOSURE__ -# endif -# endif +#ifndef __MAX_CLOSURE__ +# define MAX_CLOSURE 64 #else -# define MAX_CLOSURE 1 +# define MAX_CLOSURE __MAX_CLOSURE__ #endif +#define MAX_VOLUME_CLOSURE 8 + /* This struct is the base class for all closures. The common members are * duplicated in all derived classes since we don't have C++ in the kernel * yet, and because it lets us lay out the members to minimize padding. The @@ -866,11 +702,14 @@ enum ShaderDataFlag { SD_NEED_VOLUME_ATTRIBUTES = (1 << 28), /* Shader has emission */ SD_HAS_EMISSION = (1 << 29), + /* Shader has raytracing */ + SD_HAS_RAYTRACE = (1 << 30), SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME | SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR | SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT | - SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES) + SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION | + SD_HAS_RAYTRACE) }; /* Object flags. 
*/ @@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData #endif #ifdef __OBJECT_MOTION__ - /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ - Transform ob_tfm; - Transform ob_itfm; + /* Object <-> world space transformations for motion blur, cached to avoid + * re-interpolating them constantly for shading. */ + Transform ob_tfm_motion; + Transform ob_itfm_motion; #endif /* ray start position, only set for backgrounds */ float3 ray_P; - differential3 ray_dP; + float ray_dP; #ifdef __OSL__ - struct KernelGlobals *osl_globals; - struct PathState *osl_path_state; + const struct KernelGlobals *osl_globals; + const struct IntegratorStateCPU *osl_path_state; #endif /* LCG state for closures that require additional random numbers. */ @@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData /* Closure data, we store a fixed array of closures */ int num_closure; int num_closure_left; - float randb_closure; float3 svm_closure_weight; /* Closure weights summed directly, so we can evaluate @@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage ShaderDataTinyStorage; #define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage) -/* Path State */ +/* Compact volume closures storage. + * + * Used for decoupled direct/indirect light closure storage. 
*/ + +ccl_addr_space struct ShaderVolumeClosure { + float3 weight; + float sample_weight; + float g; +}; + +ccl_addr_space struct ShaderVolumePhases { + ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE]; + int num_closure; +}; + +/* Volume Stack */ #ifdef __VOLUME__ typedef struct VolumeStack { @@ -1007,53 +860,6 @@ typedef struct VolumeStack { } VolumeStack; #endif -typedef struct PathState { - /* see enum PathRayFlag */ - int flag; - - /* random number generator state */ - uint rng_hash; /* per pixel hash */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ - float branch_factor; /* number of branches in indirect paths */ - - /* bounce counting */ - int bounce; - int diffuse_bounce; - int glossy_bounce; - int transmission_bounce; - int transparent_bounce; - -#ifdef __DENOISING_FEATURES__ - float denoising_feature_weight; - float3 denoising_feature_throughput; -#endif /* __DENOISING_FEATURES__ */ - - /* multiple importance sampling */ - float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ - float ray_pdf; /* last bounce pdf */ -#ifdef __LAMP_MIS__ - float ray_t; /* accumulated distance through transparent surfaces */ -#endif - - /* volume rendering */ -#ifdef __VOLUME__ - int volume_bounce; - int volume_bounds_bounce; - VolumeStack volume_stack[VOLUME_STACK_SIZE]; -#endif -} PathState; - -#ifdef __VOLUME__ -typedef struct VolumeState { -# ifdef __SPLIT_KERNEL__ -# else - PathState ps; -# endif -} VolumeState; -#endif - /* Struct to gather multiple nearby intersections. */ typedef struct LocalIntersection { Ray ray; @@ -1064,20 +870,6 @@ typedef struct LocalIntersection { float3 Ng[LOCAL_MAX_HITS]; } LocalIntersection; -/* Subsurface */ - -/* Struct to gather SSS indirect rays and delay tracing them. 
*/ -typedef struct SubsurfaceIndirectRays { - PathState state[BSSRDF_MAX_HITS]; - - int num_rays; - - struct Ray rays[BSSRDF_MAX_HITS]; - float3 throughputs[BSSRDF_MAX_HITS]; - struct PathRadianceState L_state[BSSRDF_MAX_HITS]; -} SubsurfaceIndirectRays; -static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high."); - /* Constant Kernel Data * * These structs are passed from CPU to various devices, and the struct layout @@ -1128,7 +920,7 @@ typedef struct KernelCamera { /* render size */ float width, height; - int resolution; + int pad1; /* anamorphic lens bokeh */ float inv_aperture_ratio; @@ -1169,11 +961,12 @@ typedef struct KernelFilm { int light_pass_flag; int pass_stride; - int use_light_pass; int pass_combined; int pass_depth; + int pass_position; int pass_normal; + int pass_roughness; int pass_motion; int pass_motion_weight; @@ -1202,7 +995,13 @@ typedef struct KernelFilm { int pass_shadow; float pass_shadow_scale; + + int pass_shadow_catcher; + int pass_shadow_catcher_sample_count; + int pass_shadow_catcher_matte; + int filter_table_offset; + int cryptomatte_passes; int cryptomatte_depth; int pass_cryptomatte; @@ -1215,15 +1014,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; - int pass_denoising_data; - int pass_denoising_clean; - int denoising_flags; + int pass_denoising_normal; + int pass_denoising_albedo; int pass_aov_color; int pass_aov_value; - int pass_aov_color_num; - int pass_aov_value_num; - int pad1, pad2, pad3; /* XYZ to rendering color space transform. float4 instead of float3 to * ensure consistent padding/alignment across devices. 
*/ @@ -1234,19 +1029,54 @@ typedef struct KernelFilm { int pass_bake_primitive; int pass_bake_differential; - int pad; - /* viewport rendering options */ - int display_pass_stride; - int display_pass_components; - int display_divide_pass_stride; - int use_display_exposure; - int use_display_pass_alpha; + int use_approximate_shadow_catcher; - int pad4, pad5, pad6; + int pad1, pad2, pad3; } KernelFilm; static_assert_align(KernelFilm, 16); +typedef struct KernelFilmConvert { + int pass_offset; + int pass_stride; + + int pass_use_exposure; + int pass_use_filter; + + int pass_divide; + int pass_indirect; + + int pass_combined; + int pass_sample_count; + int pass_adaptive_aux_buffer; + int pass_motion_weight; + int pass_shadow_catcher; + int pass_shadow_catcher_sample_count; + int pass_shadow_catcher_matte; + int pass_background; + + float scale; + float exposure; + float scale_exposure; + + int use_approximate_shadow_catcher; + int use_approximate_shadow_catcher_background; + int show_active_pixels; + + /* Number of components to write to. */ + int num_components; + + /* Number of floats per pixel. When zero is the same as `num_components`. + * NOTE: Is ignored for half4 destination. */ + int pixel_stride; + + int is_denoised; + + /* Padding. 
*/ + int pad1; +} KernelFilmConvert; +static_assert_align(KernelFilmConvert, 16); + typedef struct KernelBackground { /* only shader index */ int surface_shader; @@ -1255,11 +1085,6 @@ typedef struct KernelBackground { int transparent; float transparent_roughness_squared_threshold; - /* ambient occlusion */ - float ao_factor; - float ao_distance; - float ao_bounces_factor; - /* portal sampling */ float portal_weight; int num_portals; @@ -1277,13 +1102,15 @@ typedef struct KernelBackground { int map_res_y; int use_mis; + + /* Padding */ + int pad1, pad2, pad3; } KernelBackground; static_assert_align(KernelBackground, 16); typedef struct KernelIntegrator { /* emission */ int use_direct_light; - int use_ambient_occlusion; int num_distribution; int num_all_lights; float pdf_triangles; @@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator { int max_transmission_bounce; int max_volume_bounce; + /* AO bounces */ int ao_bounces; + float ao_bounces_distance; + float ao_bounces_factor; /* transparent */ int transparent_min_bounce; @@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator { float sample_clamp_direct; float sample_clamp_indirect; - /* branched path */ - int branched; - int volume_decoupled; - int diffuse_samples; - int glossy_samples; - int transmission_samples; - int ao_samples; - int mesh_light_samples; - int subsurface_samples; - int sample_all_lights_direct; - int sample_all_lights_indirect; - /* mis */ int use_lamp_mis; /* sampler */ int sampling_pattern; - int aa_samples; - int adaptive_min_samples; - int adaptive_step; - int adaptive_stop_per_sample; - float adaptive_threshold; /* volume render */ int use_volumes; int volume_max_steps; float volume_step_rate; - int volume_samples; - - int start_sample; - int max_closures; + int has_shadow_catcher; + /* padding */ int pad1, pad2; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1401,14 +1212,19 @@ typedef struct KernelTables { static_assert_align(KernelTables, 16); typedef struct 
KernelBake { + int use; int object_index; int tri_offset; - int type; - int pass_filter; + int pad1; } KernelBake; static_assert_align(KernelBake, 16); typedef struct KernelData { + uint kernel_features; + uint max_closures; + uint max_shaders; + uint pad; + KernelCamera cam; KernelFilm film; KernelBackground background; @@ -1485,11 +1301,10 @@ typedef struct KernelLight { int type; float co[3]; int shader_id; - int samples; float max_bounces; float random; float strength[3]; - float pad1; + float pad1, pad2; Transform tfm; Transform itfm; union { @@ -1539,110 +1354,6 @@ typedef struct KernelShader { } KernelShader; static_assert_align(KernelShader, 16); -/* Declarations required for split kernel */ - -/* Macro for queues */ -/* Value marking queue's empty slot */ -#define QUEUE_EMPTY_SLOT -1 - -/* - * Queue 1 - Active rays - * Queue 2 - Background queue - * Queue 3 - Shadow ray cast kernel - AO - * Queue 4 - Shadow ray cast kernel - direct lighting - */ - -/* Queue names */ -enum QueueNumber { - /* All active rays and regenerated rays are enqueued here. */ - QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0, - - /* All - * 1. Background-hit rays, - * 2. Rays that has exited path-iteration but needs to update output buffer - * 3. Rays to be regenerated - * are enqueued here. - */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - - /* All rays for which a shadow ray should be cast to determine radiance - * contribution for AO are enqueued here. - */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - - /* All rays for which a shadow ray should be cast to determine radiance - * contributing for direct lighting are enqueued here. - */ - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - - /* Rays sorted according to shader->id */ - QUEUE_SHADER_SORTED_RAYS, - -#ifdef __BRANCHED_PATH__ - /* All rays moving to next iteration of the indirect loop for light */ - QUEUE_LIGHT_INDIRECT_ITER, - /* Queue of all inactive rays. 
These are candidates for sharing work of indirect loops */ - QUEUE_INACTIVE_RAYS, -# ifdef __VOLUME__ - /* All rays moving to next iteration of the indirect loop for volumes */ - QUEUE_VOLUME_INDIRECT_ITER, -# endif -# ifdef __SUBSURFACE__ - /* All rays moving to next iteration of the indirect loop for subsurface */ - QUEUE_SUBSURFACE_INDIRECT_ITER, -# endif -#endif /* __BRANCHED_PATH__ */ - - NUM_QUEUES -}; - -/* We use RAY_STATE_MASK to get ray_state */ -#define RAY_STATE_MASK 0x0F -#define RAY_FLAG_MASK 0xF0 -enum RayState { - RAY_INVALID = 0, - /* Denotes ray is actively involved in path-iteration. */ - RAY_ACTIVE, - /* Denotes ray has completed processing all samples and is inactive. */ - RAY_INACTIVE, - /* Denotes ray has exited path-iteration and needs to update output buffer. */ - RAY_UPDATE_BUFFER, - /* Denotes ray needs to skip most surface shader work. */ - RAY_HAS_ONLY_VOLUME, - /* Denotes ray has hit background */ - RAY_HIT_BACKGROUND, - /* Denotes ray has to be regenerated */ - RAY_TO_REGENERATE, - /* Denotes ray has been regenerated */ - RAY_REGENERATED, - /* Denotes ray is moving to next iteration of the branched indirect loop */ - RAY_LIGHT_INDIRECT_NEXT_ITER, - RAY_VOLUME_INDIRECT_NEXT_ITER, - RAY_SUBSURFACE_INDIRECT_NEXT_ITER, - - /* Ray flags */ - - /* Flags to denote that the ray is currently evaluating the branched indirect loop */ - RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), - RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), - RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), - RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | - RAY_BRANCHED_SUBSURFACE_INDIRECT), - - /* Ray is evaluating an iteration of an indirect loop for another thread */ - RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), -}; - -#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \ - (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) \ - ((ray_index) != QUEUE_EMPTY_SLOT && 
((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) -#define ADD_RAY_FLAG(ray_state, ray_index, flag) \ - (ray_state[ray_index] = (ray_state[ray_index] | flag)) -#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \ - (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) -#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) - /* Patches */ #define PATCH_MAX_CONTROL_VERTS 16 @@ -1655,7 +1366,7 @@ enum RayState { /* Work Tiles */ -typedef struct WorkTile { +typedef struct KernelWorkTile { uint x, y, w, h; uint start_sample; @@ -1664,13 +1375,172 @@ typedef struct WorkTile { int offset; uint stride; - ccl_global float *buffer; -} WorkTile; + /* Precalculated parameters used by init_from_camera kernel on GPU. */ + int path_index_offset; + int work_size; +} KernelWorkTile; + +/* Shader Evaluation. + * + * Position on a primitive on an object at which we want to evaluate the + * shader for e.g. mesh displacement or light importance map. */ + +typedef struct KernelShaderEvalInput { + int object; + int prim; + float u, v; +} KernelShaderEvalInput; +static_assert_align(KernelShaderEvalInput, 16); /* Pre-computed sample table sizes for PMJ02 sampler. */ -#define NUM_PMJ_SAMPLES (64 * 64) -#define NUM_PMJ_PATTERNS 48 +#define NUM_PMJ_DIVISIONS 32 +#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS)) +#define NUM_PMJ_PATTERNS 1 -CCL_NAMESPACE_END +/* Device kernels. + * + * Identifier for kernels that can be executed in device queues. + * + * Some implementation details. + * + * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified. + * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. 
*/ + +typedef enum DeviceKernel { + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0, + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, + DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL, + + DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, + DEVICE_KERNEL_INTEGRATOR_RESET, + DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, + + DEVICE_KERNEL_SHADER_EVAL_DISPLACE, + DEVICE_KERNEL_SHADER_EVAL_BACKGROUND, + +#define DECLARE_FILM_CONVERT_KERNEL(variant) \ + DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA + + DECLARE_FILM_CONVERT_KERNEL(DEPTH), + DECLARE_FILM_CONVERT_KERNEL(MIST), + DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT), + DECLARE_FILM_CONVERT_KERNEL(FLOAT), + DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH), + DECLARE_FILM_CONVERT_KERNEL(FLOAT3), + DECLARE_FILM_CONVERT_KERNEL(MOTION), + DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE), + DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER), + DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW), + DECLARE_FILM_CONVERT_KERNEL(COMBINED), + DECLARE_FILM_CONVERT_KERNEL(FLOAT4), + +#undef DECLARE_FILM_CONVERT_KERNEL + + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, + + 
DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, + DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, + DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, + DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, + + DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, + + DEVICE_KERNEL_PREFIX_SUM, + + DEVICE_KERNEL_NUM, +} DeviceKernel; + +enum { + DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1, +}; + +/* Kernel Features */ + +enum KernelFeatureFlag : unsigned int { + /* Shader nodes. */ + KERNEL_FEATURE_NODE_BSDF = (1U << 0U), + KERNEL_FEATURE_NODE_EMISSION = (1U << 1U), + KERNEL_FEATURE_NODE_VOLUME = (1U << 2U), + KERNEL_FEATURE_NODE_HAIR = (1U << 3U), + KERNEL_FEATURE_NODE_BUMP = (1U << 4U), + KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U), + KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U), + KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U), + + /* Use denoising kernels and output denoising passes. */ + KERNEL_FEATURE_DENOISING = (1U << 8U), + + /* Use path tracing kernels. */ + KERNEL_FEATURE_PATH_TRACING = (1U << 9U), -#endif /* __KERNEL_TYPES_H__ */ + /* BVH/sampling kernel features. */ + KERNEL_FEATURE_HAIR = (1U << 10U), + KERNEL_FEATURE_HAIR_THICK = (1U << 11U), + KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U), + KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U), + + /* Denotes whether baking functionality is needed. */ + KERNEL_FEATURE_BAKING = (1U << 14U), + + /* Use subsurface scattering materials. */ + KERNEL_FEATURE_SUBSURFACE = (1U << 15U), + + /* Use volume materials. */ + KERNEL_FEATURE_VOLUME = (1U << 16U), + + /* Use OpenSubdiv patch evaluation */ + KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U), + + /* Use Transparent shadows */ + KERNEL_FEATURE_TRANSPARENT = (1U << 18U), + + /* Use shadow catcher. */ + KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U), + + /* Per-uber shader usage flags. */ + KERNEL_FEATURE_PRINCIPLED = (1U << 20U), + + /* Light render passes. */ + KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U), + + /* Shadow render pass. 
*/ + KERNEL_FEATURE_SHADOW_PASS = (1U << 22U), +}; + +/* Shader node feature mask, to specialize shader evaluation for kernels. */ + +#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \ + (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \ + (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \ + KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \ + KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_SURFACE \ + (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE) +#define KERNEL_FEATURE_NODE_MASK_VOLUME \ + (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \ + (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE) +#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT + +#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U) + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h deleted file mode 100644 index f6b34be040e..00000000000 --- a/intern/cycles/kernel/kernel_volume.h +++ /dev/null @@ -1,1440 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -/* Ignore paths that have volume throughput below this value, to avoid unnecessary work - * and precision issues. - * todo: this value could be tweaked or turned into a probability to avoid unnecessary - * work in volumes and subsurface scattering. */ -#define VOLUME_THROUGHPUT_EPSILON 1e-6f - -/* Events for probalistic scattering */ - -typedef enum VolumeIntegrateResult { - VOLUME_PATH_SCATTERED = 0, - VOLUME_PATH_ATTENUATED = 1, - VOLUME_PATH_MISSED = 2 -} VolumeIntegrateResult; - -/* Volume shader properties - * - * extinction coefficient = absorption coefficient + scattering coefficient - * sigma_t = sigma_a + sigma_s */ - -typedef struct VolumeShaderCoefficients { - float3 sigma_t; - float3 sigma_s; - float3 emission; -} VolumeShaderCoefficients; - -#ifdef __VOLUME__ - -/* evaluate shader to get extinction coefficient at P */ -ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 P, - float3 *extinction) -{ - sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); - - if (sd->flag & SD_EXTINCTION) { - const float density = object_volume_density(kg, sd->object); - *extinction = sd->closure_transparent_extinction * density; - return true; - } - else { - return false; - } -} - -/* evaluate shader to get absorption, scattering and emission at P */ -ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 P, - VolumeShaderCoefficients *coeff) -{ - sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); - - if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) - return false; - - coeff->sigma_s = zero_float3(); - coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3(); - coeff->emission = (sd->flag & SD_EMISSION) ? 
sd->closure_emission_background : zero_float3(); - - if (sd->flag & SD_SCATTER) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_VOLUME(sc->type)) - coeff->sigma_s += sc->weight; - } - } - - const float density = object_volume_density(kg, sd->object); - coeff->sigma_s *= density; - coeff->sigma_t *= density; - coeff->emission *= density; - - return true; -} - -#endif /* __VOLUME__ */ - -ccl_device float3 volume_color_transmittance(float3 sigma, float t) -{ - return exp3(-sigma * t); -} - -ccl_device float kernel_volume_channel_get(float3 value, int channel) -{ - return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z); -} - -#ifdef __VOLUME__ - -ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack) -{ - float step_size = FLT_MAX; - - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; - - bool heterogeneous = false; - - if (shader_flag & SD_HETEROGENEOUS_VOLUME) { - heterogeneous = true; - } - else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) { - /* We want to render world or objects without any volume grids - * as homogeneous, but can only verify this at run-time since other - * heterogeneous volume objects may be using the same shader. 
*/ - int object = stack[i].object; - if (object != OBJECT_NONE) { - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) { - heterogeneous = true; - } - } - } - - if (heterogeneous) { - float object_step_size = object_volume_step_size(kg, stack[i].object); - object_step_size *= kernel_data.integrator.volume_step_rate; - step_size = fminf(object_step_size, step_size); - } - } - - return step_size; -} - -ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack) -{ - if (kernel_data.integrator.num_all_lights == 0) - return 0; - - int method = -1; - - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; - - if (shader_flag & SD_VOLUME_MIS) { - return SD_VOLUME_MIS; - } - else if (shader_flag & SD_VOLUME_EQUIANGULAR) { - if (method == 0) - return SD_VOLUME_MIS; - - method = SD_VOLUME_EQUIANGULAR; - } - else { - if (method == SD_VOLUME_EQUIANGULAR) - return SD_VOLUME_MIS; - - method = 0; - } - } - - return method; -} - -ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg, - ccl_addr_space PathState *state, - const float object_step_size, - float t, - float *step_size, - float *step_shade_offset, - float *steps_offset) -{ - const int max_steps = kernel_data.integrator.volume_max_steps; - float step = min(object_step_size, t); - - /* compute exact steps in advance for malloc */ - if (t > max_steps * step) { - step = t / (float)max_steps; - } - - *step_size = step; - - /* Perform shading at this offset within a step, to integrate over - * over the entire step segment. */ - *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4); - - /* Shift starting point of all segment by this random amount to avoid - * banding artifacts from the volume bounding shape. 
*/ - *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3); -} - -/* Volume Shadows - * - * These functions are used to attenuate shadow rays to lights. Both absorption - * and scattering will block light, represented by the extinction coefficient. */ - -/* homogeneous volume: assume shader evaluation at the starts gives - * the extinction coefficient for the entire line segment */ -ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput) -{ - float3 sigma_t = zero_float3(); - - if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t)) - *throughput *= volume_color_transmittance(sigma_t, ray->t); -} - -/* heterogeneous volume: integrate stepping through the volume until we - * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput, - const float object_step_size) -{ - float3 tp = *throughput; - - /* Prepare for stepping. - * For shadows we do not offset all segments, since the starting point is - * already a random distance inside the volume. It also appears to create - * banding artifacts for unknown reasons. 
*/ - int max_steps = kernel_data.integrator.volume_max_steps; - float step_size, step_shade_offset, unused; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused); - const float steps_offset = 1.0f; - - /* compute extinction at the start */ - float t = 0.0f; - - float3 sum = zero_float3(); - - for (int i = 0; i < max_steps; i++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - float3 sigma_t = zero_float3(); - - /* compute attenuation over segment */ - if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) { - /* Compute expf() only for every Nth step, to save some calculations - * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON - * check then. */ - sum += (-sigma_t * dt); - if ((i & 0x07) == 0) { /* ToDo: Other interval? */ - tp = *throughput * exp3(sum); - - /* stop if nearly all light is blocked */ - if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && - tp.z < VOLUME_THROUGHPUT_EPSILON) - break; - } - } - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) { - /* Update throughput in case we haven't done it above */ - tp = *throughput * exp3(sum); - break; - } - } - - *throughput = tp; -} - -/* get the volume attenuation over line segment defined by ray, with the - * assumption that there are no surfaces blocking light between the endpoints */ -# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *throughput) -{ - optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput); -} -extern "C" __device__ void __direct_callable__kernel_volume_shadow( -# else -ccl_device_noinline void kernel_volume_shadow( -# endif - KernelGlobals *kg, - 
ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *throughput) -{ - shader_setup_from_volume(kg, shadow_sd, ray); - - float step_size = volume_stack_step_size(kg, state->volume_stack); - if (step_size != FLT_MAX) - kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size); - else - kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput); -} - -#endif /* __VOLUME__ */ - -/* Equi-angular sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" */ - -ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf) -{ - float t = ray->t; - - float delta = dot((light_P - ray->P), ray->D); - float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); - if (UNLIKELY(D == 0.0f)) { - *pdf = 0.0f; - return 0.0f; - } - float theta_a = -atan2f(delta, D); - float theta_b = atan2f(t - delta, D); - float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); - if (UNLIKELY(theta_b == theta_a)) { - *pdf = 0.0f; - return 0.0f; - } - *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); - - return min(t, delta + t_); /* min is only for float precision errors */ -} - -ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t) -{ - float delta = dot((light_P - ray->P), ray->D); - float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); - if (UNLIKELY(D == 0.0f)) { - return 0.0f; - } - - float t = ray->t; - float t_ = sample_t - delta; - - float theta_a = -atan2f(delta, D); - float theta_b = atan2f(t - delta, D); - if (UNLIKELY(theta_b == theta_a)) { - return 0.0f; - } - - float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); - - return pdf; -} - -/* Distance sampling */ - -ccl_device float kernel_volume_distance_sample( - float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) -{ - /* xi is [0, 1[ so log(0) should never happen, division by zero is - * avoided 
because sample_sigma_t > 0 when SD_SCATTER is set */ - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); - float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel); - - float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t); - - *transmittance = volume_color_transmittance(sigma_t, sample_t); - *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance); - - /* todo: optimization: when taken together with hit/miss decision, - * the full_transmittance cancels out drops out and xi does not - * need to be remapped */ - - return sample_t; -} - -ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t) -{ - float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); - float3 transmittance = volume_color_transmittance(sigma_t, sample_t); - - return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance); -} - -/* Emission */ - -ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff, - int closure_flag, - float3 transmittance, - float t) -{ - /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t - * this goes to E * t as sigma_t goes to zero - * - * todo: we should use an epsilon to avoid precision issues near zero sigma_t */ - float3 emission = coeff->emission; - - if (closure_flag & SD_EXTINCTION) { - float3 sigma_t = coeff->sigma_t; - - emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t; - emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t; - emission.z *= (sigma_t.z > 0.0f) ? 
(1.0f - transmittance.z) / sigma_t.z : t; - } - else - emission *= t; - - return emission; -} - -/* Volume Path */ - -ccl_device int kernel_volume_sample_channel(float3 albedo, - float3 throughput, - float rand, - float3 *pdf) -{ - /* Sample color channel proportional to throughput and single scattering - * albedo, to significantly reduce noise with many bounce, following: - * - * "Practical and Controllable Subsurface Scattering for Production Path - * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ - float3 weights = fabs(throughput * albedo); - float sum_weights = weights.x + weights.y + weights.z; - float3 weights_pdf; - - if (sum_weights > 0.0f) { - weights_pdf = weights / sum_weights; - } - else { - weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f); - } - - *pdf = weights_pdf; - - /* OpenCL does not support -> on float3, so don't use pdf->x. */ - if (rand < weights_pdf.x) { - return 0; - } - else if (rand < weights_pdf.x + weights_pdf.y) { - return 1; - } - else { - return 2; - } -} - -#ifdef __VOLUME__ - -/* homogeneous volume: assume shader evaluation at the start gives - * the volume shading coefficient for the entire line segment */ -ccl_device VolumeIntegrateResult -kernel_volume_integrate_homogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - PathRadiance *L, - ccl_addr_space float3 *throughput, - bool probalistic_scatter) -{ - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - if (!volume_shader_sample(kg, sd, state, ray->P, &coeff)) - return VOLUME_PATH_MISSED; - - int closure_flag = sd->flag; - float t = ray->t; - float3 new_tp; - -# ifdef __VOLUME_SCATTER__ - /* randomly scatter, and if we do t is shortened */ - if (closure_flag & SD_SCATTER) { - /* Sample channel, use MIS with balance heuristic. 
*/ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf); - - /* decide if we will hit or miss */ - bool scatter = true; - float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - if (probalistic_scatter) { - float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); - float sample_transmittance = expf(-sample_sigma_t * t); - - if (1.0f - xi >= sample_transmittance) { - scatter = true; - - /* rescale random number so we can reuse it */ - xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance); - } - else - scatter = false; - } - - if (scatter) { - /* scattering */ - float3 pdf; - float3 transmittance; - float sample_t; - - /* distance sampling */ - sample_t = kernel_volume_distance_sample( - ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf); - - /* modify pdf for hit/miss decision */ - if (probalistic_scatter) - pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t); - - new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf); - t = sample_t; - } - else { - /* no scattering */ - float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); - float pdf = dot(channel_pdf, transmittance); - new_tp = *throughput * transmittance / pdf; - } - } - else -# endif - if (closure_flag & SD_EXTINCTION) { - /* absorption only, no sampling needed */ - float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); - new_tp = *throughput * transmittance; - } - else { - new_tp = *throughput; - } - - /* integrate emission attenuated by extinction */ - if (L && (closure_flag & SD_EMISSION)) { - float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t); - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, ray->t); - 
path_radiance_accum_emission(kg, L, state, *throughput, emission); - } - - /* modify throughput */ - if (closure_flag & SD_EXTINCTION) { - *throughput = new_tp; - - /* prepare to scatter to new direction */ - if (t < ray->t) { - /* adjust throughput and move to new location */ - sd->P = ray->P + t * ray->D; - - return VOLUME_PATH_SCATTERED; - } - } - - return VOLUME_PATH_ATTENUATED; -} - -/* heterogeneous volume distance sampling: integrate stepping through the - * volume until we reach the end, get absorbed entirely, or run out of - * iterations. this does probabilistically scatter or get transmitted through - * for path tracing where we don't want to branch. */ -ccl_device VolumeIntegrateResult -kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - PathRadiance *L, - ccl_addr_space float3 *throughput, - const float object_step_size) -{ - float3 tp = *throughput; - - /* Prepare for stepping. - * Using a different step offset for the first step avoids banding artifacts. 
*/ - int max_steps = kernel_data.integrator.volume_max_steps; - float step_size, step_shade_offset, steps_offset; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset); - - /* compute coefficients at the start */ - float t = 0.0f; - float3 accum_transmittance = one_float3(); - - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - bool has_scatter = false; - - for (int i = 0; i < max_steps; i++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - /* compute segment */ - if (volume_shader_sample(kg, sd, state, new_P, &coeff)) { - int closure_flag = sd->flag; - float3 new_tp; - float3 transmittance; - bool scatter = false; - - /* distance sampling */ -# ifdef __VOLUME_SCATTER__ - if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) { - has_scatter = true; - - /* Sample channel, use MIS with balance heuristic. 
*/ - float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf); - - /* compute transmittance over full step */ - transmittance = volume_color_transmittance(coeff.sigma_t, dt); - - /* decide if we will scatter or continue */ - float sample_transmittance = kernel_volume_channel_get(transmittance, channel); - - if (1.0f - xi >= sample_transmittance) { - /* compute sampling distance */ - float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); - float new_dt = -logf(1.0f - xi) / sample_sigma_t; - new_t = t + new_dt; - - /* transmittance and pdf */ - float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); - float3 pdf = coeff.sigma_t * new_transmittance; - - /* throughput */ - new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf); - scatter = true; - } - else { - /* throughput */ - float pdf = dot(channel_pdf, transmittance); - new_tp = tp * transmittance / pdf; - - /* remap xi so we can reuse it and keep thing stratified */ - xi = 1.0f - (1.0f - xi) / sample_transmittance; - } - } - else -# endif - if (closure_flag & SD_EXTINCTION) { - /* absorption only, no sampling needed */ - transmittance = volume_color_transmittance(coeff.sigma_t, dt); - new_tp = tp * transmittance; - } - else { - transmittance = zero_float3(); - new_tp = tp; - } - - /* integrate emission attenuated by absorption */ - if (L && (closure_flag & SD_EMISSION)) { - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, dt); - path_radiance_accum_emission(kg, L, state, tp, emission); - } - - /* modify throughput */ - if (closure_flag & SD_EXTINCTION) { - tp = new_tp; - - /* stop if nearly all light blocked */ - if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && - tp.z < VOLUME_THROUGHPUT_EPSILON) { - tp = zero_float3(); - break; - } - } - - /* prepare to scatter to new direction */ 
- if (scatter) { - /* adjust throughput and move to new location */ - sd->P = ray->P + new_t * ray->D; - *throughput = tp; - - return VOLUME_PATH_SCATTERED; - } - else { - /* accumulate transmittance */ - accum_transmittance *= transmittance; - } - } - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) - break; - } - - *throughput = tp; - - return VOLUME_PATH_ATTENUATED; -} - -/* get the volume attenuation and emission over line segment defined by - * ray, with the assumption that there are no surfaces blocking light - * between the endpoints. distance sampling is used to decide if we will - * scatter or not. */ -ccl_device_noinline_cpu VolumeIntegrateResult -kernel_volume_integrate(KernelGlobals *kg, - ccl_addr_space PathState *state, - ShaderData *sd, - Ray *ray, - PathRadiance *L, - ccl_addr_space float3 *throughput, - float step_size) -{ - shader_setup_from_volume(kg, sd, ray); - - if (step_size != FLT_MAX) - return kernel_volume_integrate_heterogeneous_distance( - kg, state, ray, sd, L, throughput, step_size); - else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); -} - -# ifndef __SPLIT_KERNEL__ -/* Decoupled Volume Sampling - * - * VolumeSegment is list of coefficients and transmittance stored at all steps - * through a volume. This can then later be used for decoupled sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" - * - * On the GPU this is only supported (but currently not enabled) - * for homogeneous volumes (1 step), due to - * no support for malloc/free and too much stack usage with a fix size array. 
*/ - -typedef struct VolumeStep { - float3 sigma_s; /* scatter coefficient */ - float3 sigma_t; /* extinction coefficient */ - float3 accum_transmittance; /* accumulated transmittance including this step */ - float3 cdf_distance; /* cumulative density function for distance sampling */ - float t; /* distance at end of this step */ - float shade_t; /* jittered distance where shading was done in step */ - int closure_flag; /* shader evaluation closure flags */ -} VolumeStep; - -typedef struct VolumeSegment { - VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */ - VolumeStep *steps; /* recorded steps */ - int numsteps; /* number of steps */ - int closure_flag; /* accumulated closure flags from all steps */ - - float3 accum_emission; /* accumulated emission at end of segment */ - float3 accum_transmittance; /* accumulated transmittance at end of segment */ - float3 accum_albedo; /* accumulated average albedo over segment */ - - int sampling_method; /* volume sampling method */ -} VolumeSegment; - -/* record volume steps to the end of the volume. - * - * it would be nice if we could only record up to the point that we need to scatter, - * but the entire segment is needed to do always scattering, rather than probabilistically - * hitting or missing the volume. 
if we don't know the transmittance at the end of the - * volume we can't generate stratified distance samples up to that transmittance */ -# ifdef __VOLUME_DECOUPLED__ -ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, - PathState *state, - Ray *ray, - ShaderData *sd, - VolumeSegment *segment, - const float object_step_size) -{ - /* prepare for volume stepping */ - int max_steps; - float step_size, step_shade_offset, steps_offset; - - if (object_step_size != FLT_MAX) { - max_steps = kernel_data.integrator.volume_max_steps; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset); - -# ifdef __KERNEL_CPU__ - /* NOTE: For the branched path tracing it's possible to have direct - * and indirect light integration both having volume segments allocated. - * We detect this using index in the pre-allocated memory. Currently we - * only support two segments allocated at a time, if more needed some - * modifications to the KernelGlobals will be needed. - * - * This gives us restrictions that decoupled record should only happen - * in the stack manner, meaning if there's subsequent call of decoupled - * record it'll need to free memory before its caller frees memory. 
- */ - const int index = kg->decoupled_volume_steps_index; - assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps)); - if (kg->decoupled_volume_steps[index] == NULL) { - kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps); - } - segment->steps = kg->decoupled_volume_steps[index]; - ++kg->decoupled_volume_steps_index; -# else - segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps); -# endif - } - else { - max_steps = 1; - step_size = ray->t; - step_shade_offset = 0.0f; - steps_offset = 1.0f; - segment->steps = &segment->stack_step; - } - - /* init accumulation variables */ - float3 accum_emission = zero_float3(); - float3 accum_transmittance = one_float3(); - float3 accum_albedo = zero_float3(); - float3 cdf_distance = zero_float3(); - float t = 0.0f; - - segment->numsteps = 0; - segment->closure_flag = 0; - bool is_last_step_empty = false; - - VolumeStep *step = segment->steps; - - for (int i = 0; i < max_steps; i++, step++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - /* compute segment */ - if (volume_shader_sample(kg, sd, state, new_P, &coeff)) { - int closure_flag = sd->flag; - float3 sigma_t = coeff.sigma_t; - - /* compute average albedo for channel sampling */ - if (closure_flag & SD_SCATTER) { - accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t); - } - - /* compute accumulated transmittance */ - float3 transmittance = volume_color_transmittance(sigma_t, dt); - - /* compute emission attenuated by absorption */ - if (closure_flag & SD_EMISSION) { - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, dt); - accum_emission += accum_transmittance * emission; - } - - accum_transmittance *= transmittance; - - /* 
compute pdf for distance sampling */ - float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s; - cdf_distance = cdf_distance + pdf_distance; - - /* write step data */ - step->sigma_t = sigma_t; - step->sigma_s = coeff.sigma_s; - step->closure_flag = closure_flag; - - segment->closure_flag |= closure_flag; - - is_last_step_empty = false; - segment->numsteps++; - } - else { - if (is_last_step_empty) { - /* consecutive empty step, merge */ - step--; - } - else { - /* store empty step */ - step->sigma_t = zero_float3(); - step->sigma_s = zero_float3(); - step->closure_flag = 0; - - segment->numsteps++; - is_last_step_empty = true; - } - } - - step->accum_transmittance = accum_transmittance; - step->cdf_distance = cdf_distance; - step->t = new_t; - step->shade_t = t + dt * step_shade_offset; - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) - break; - - /* stop if nearly all light blocked */ - if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON && - accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON && - accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON) - break; - } - - /* store total emission and transmittance */ - segment->accum_emission = accum_emission; - segment->accum_transmittance = accum_transmittance; - segment->accum_albedo = accum_albedo; - - /* normalize cumulative density function for distance sampling */ - VolumeStep *last_step = segment->steps + segment->numsteps - 1; - - if (!is_zero(last_step->cdf_distance)) { - VolumeStep *step = &segment->steps[0]; - int numsteps = segment->numsteps; - float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance); - - for (int i = 0; i < numsteps; i++, step++) - step->cdf_distance *= inv_cdf_distance_sum; - } -} - -ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment) -{ - if (segment->steps != &segment->stack_step) { -# ifdef __KERNEL_CPU__ - /* NOTE: We only allow free last allocated segment. 
- * No random order of alloc/free is supported. - */ - assert(kg->decoupled_volume_steps_index > 0); - assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]); - --kg->decoupled_volume_steps_index; -# else - free(segment->steps); -# endif - } -} -# endif /* __VOLUME_DECOUPLED__ */ - -/* scattering for homogeneous and heterogeneous volumes, using decoupled ray - * marching. - * - * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */ -ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg, - PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput, - float rphase, - float rscatter, - const VolumeSegment *segment, - const float3 *light_P, - bool probalistic_scatter) -{ - kernel_assert(segment->closure_flag & SD_SCATTER); - - /* Sample color channel, use MIS with balance heuristic. */ - float3 channel_pdf; - int channel = kernel_volume_sample_channel( - segment->accum_albedo, *throughput, rphase, &channel_pdf); - - float xi = rscatter; - - /* probabilistic scattering decision based on transmittance */ - if (probalistic_scatter) { - float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel); - - if (1.0f - xi >= sample_transmittance) { - /* rescale random number so we can reuse it */ - xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance); - } - else { - *throughput /= sample_transmittance; - return VOLUME_PATH_MISSED; - } - } - - VolumeStep *step; - float3 transmittance; - float pdf, sample_t; - float mis_weight = 1.0f; - bool distance_sample = true; - bool use_mis = false; - - if (segment->sampling_method && light_P) { - if (segment->sampling_method == SD_VOLUME_MIS) { - /* multiple importance sample: randomly pick between - * equiangular and distance sampling strategy */ - if (xi < 0.5f) { - xi *= 2.0f; - } - else { - xi = (xi - 0.5f) * 2.0f; - distance_sample = false; - } - - use_mis = true; - } 
- else { - /* only equiangular sampling */ - distance_sample = false; - } - } - - /* distance sampling */ - if (distance_sample) { - /* find step in cdf */ - step = segment->steps; - - float prev_t = 0.0f; - float3 step_pdf_distance = one_float3(); - - if (segment->numsteps > 1) { - float prev_cdf = 0.0f; - float step_cdf = 1.0f; - float3 prev_cdf_distance = zero_float3(); - - for (int i = 0;; i++, step++) { - /* todo: optimize using binary search */ - step_cdf = kernel_volume_channel_get(step->cdf_distance, channel); - - if (xi < step_cdf || i == segment->numsteps - 1) - break; - - prev_cdf = step_cdf; - prev_t = step->t; - prev_cdf_distance = step->cdf_distance; - } - - /* remap xi so we can reuse it */ - xi = (xi - prev_cdf) / (step_cdf - prev_cdf); - - /* pdf for picking step */ - step_pdf_distance = step->cdf_distance - prev_cdf_distance; - } - - /* determine range in which we will sample */ - float step_t = step->t - prev_t; - - /* sample distance and compute transmittance */ - float3 distance_pdf; - sample_t = prev_t + kernel_volume_distance_sample( - step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf); - - /* modify pdf for hit/miss decision */ - if (probalistic_scatter) - distance_pdf *= one_float3() - segment->accum_transmittance; - - pdf = dot(channel_pdf, distance_pdf * step_pdf_distance); - - /* multiple importance sampling */ - if (use_mis) { - float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t); - mis_weight = 2.0f * power_heuristic(pdf, equi_pdf); - } - } - /* equi-angular sampling */ - else { - /* sample distance */ - sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf); - - /* find step in which sampled distance is located */ - step = segment->steps; - - float prev_t = 0.0f; - float3 step_pdf_distance = one_float3(); - - if (segment->numsteps > 1) { - float3 prev_cdf_distance = zero_float3(); - - int numsteps = segment->numsteps; - int high = numsteps - 1; - int low = 0; - int mid; - - while (low 
< high) { - mid = (low + high) >> 1; - - if (sample_t < step[mid].t) - high = mid; - else if (sample_t >= step[mid + 1].t) - low = mid + 1; - else { - /* found our interval in step[mid] .. step[mid+1] */ - prev_t = step[mid].t; - prev_cdf_distance = step[mid].cdf_distance; - step += mid + 1; - break; - } - } - - if (low >= numsteps - 1) { - prev_t = step[numsteps - 1].t; - prev_cdf_distance = step[numsteps - 1].cdf_distance; - step += numsteps - 1; - } - - /* pdf for picking step with distance sampling */ - step_pdf_distance = step->cdf_distance - prev_cdf_distance; - } - - /* determine range in which we will sample */ - float step_t = step->t - prev_t; - float step_sample_t = sample_t - prev_t; - - /* compute transmittance */ - transmittance = volume_color_transmittance(step->sigma_t, step_sample_t); - - /* multiple importance sampling */ - if (use_mis) { - float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t); - float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance); - mis_weight = 2.0f * power_heuristic(pdf, distance_pdf); - } - } - if (sample_t < 0.0f || pdf == 0.0f) { - return VOLUME_PATH_MISSED; - } - - /* compute transmittance up to this step */ - if (step != segment->steps) - transmittance *= (step - 1)->accum_transmittance; - - /* modify throughput */ - *throughput *= step->sigma_s * transmittance * (mis_weight / pdf); - - /* evaluate shader to create closures at shading point */ - if (segment->numsteps > 1) { - sd->P = ray->P + step->shade_t * ray->D; - - VolumeShaderCoefficients coeff; - volume_shader_sample(kg, sd, state, sd->P, &coeff); - } - - /* move to new position */ - sd->P = ray->P + sample_t * ray->D; - - return VOLUME_PATH_SCATTERED; -} -# endif /* __SPLIT_KERNEL */ - -/* decide if we need to use decoupled or not */ -ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, - bool heterogeneous, - bool direct, - int sampling_method) -{ - /* decoupled ray marching for heterogeneous volumes 
not supported on the GPU, - * which also means equiangular and multiple importance sampling is not - * support for that case */ - if (!kernel_data.integrator.volume_decoupled) - return false; - -# ifdef __KERNEL_GPU__ - if (heterogeneous) - return false; -# endif - - /* equiangular and multiple importance sampling only implemented for decoupled */ - if (sampling_method != 0) - return true; - - /* for all light sampling use decoupled, reusing shader evaluations is - * typically faster in that case */ - if (direct) - return kernel_data.integrator.sample_all_lights_direct; - else - return kernel_data.integrator.sample_all_lights_indirect; -} - -/* Volume Stack - * - * This is an array of object/shared ID's that the current segment of the path - * is inside of. */ - -ccl_device void kernel_volume_stack_init(KernelGlobals *kg, - ShaderData *stack_sd, - ccl_addr_space const PathState *state, - ccl_addr_space const Ray *ray, - ccl_addr_space VolumeStack *stack) -{ - /* NULL ray happens in the baker, does it need proper initialization of - * camera in volume? - */ - if (!kernel_data.cam.is_inside_volume || ray == NULL) { - /* Camera is guaranteed to be in the air, only take background volume - * into account in this case. 
- */ - if (kernel_data.background.volume_shader != SHADER_NONE) { - stack[0].shader = kernel_data.background.volume_shader; - stack[0].object = PRIM_NONE; - stack[1].shader = SHADER_NONE; - } - else { - stack[0].shader = SHADER_NONE; - } - return; - } - - kernel_assert(state->flag & PATH_RAY_CAMERA); - - Ray volume_ray = *ray; - volume_ray.t = FLT_MAX; - - const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY); - int stack_index = 0, enclosed_index = 0; - -# ifdef __VOLUME_RECORD_ALL__ - Intersection hits[2 * VOLUME_STACK_SIZE + 1]; - uint num_hits = scene_intersect_volume_all( - kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility); - if (num_hits > 0) { - int enclosed_volumes[VOLUME_STACK_SIZE]; - Intersection *isect = hits; - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for (uint hit = 0; hit < num_hits; ++hit, ++isect) { - shader_setup_from_ray(kg, stack_sd, isect, &volume_ray); - if (stack_sd->flag & SD_BACKFACING) { - bool need_add = true; - for (int i = 0; i < enclosed_index && need_add; ++i) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - if (enclosed_volumes[i] == stack_sd->object) { - need_add = false; - } - } - for (int i = 0; i < stack_index && need_add; ++i) { - /* Don't add intersections twice. */ - if (stack[i].object == stack_sd->object) { - need_add = false; - break; - } - } - if (need_add && stack_index < VOLUME_STACK_SIZE - 1) { - stack[stack_index].object = stack_sd->object; - stack[stack_index].shader = stack_sd->shader; - ++stack_index; - } - } - else { - /* If ray from camera enters the volume, this volume shouldn't - * be added to the stack on exit. 
- */ - enclosed_volumes[enclosed_index++] = stack_sd->object; - } - } - } -# else - int enclosed_volumes[VOLUME_STACK_SIZE]; - int step = 0; - - while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 && - step < 2 * VOLUME_STACK_SIZE) { - Intersection isect; - if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) { - break; - } - - shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray); - if (stack_sd->flag & SD_BACKFACING) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - bool need_add = true; - for (int i = 0; i < enclosed_index && need_add; ++i) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - if (enclosed_volumes[i] == stack_sd->object) { - need_add = false; - } - } - for (int i = 0; i < stack_index && need_add; ++i) { - /* Don't add intersections twice. */ - if (stack[i].object == stack_sd->object) { - need_add = false; - break; - } - } - if (need_add) { - stack[stack_index].object = stack_sd->object; - stack[stack_index].shader = stack_sd->shader; - ++stack_index; - } - } - else { - /* If ray from camera enters the volume, this volume shouldn't - * be added to the stack on exit. - */ - enclosed_volumes[enclosed_index++] = stack_sd->object; - } - - /* Move ray forward. */ - volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); - ++step; - } -# endif - /* stack_index of 0 means quick checks outside of the kernel gave false - * positive, nothing to worry about, just we've wasted quite a few of - * ticks just to come into conclusion that camera is in the air. - * - * In this case we're doing the same above -- check whether background has - * volume. 
- */ - if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) { - stack[0].shader = kernel_data.background.volume_shader; - stack[0].object = OBJECT_NONE; - stack[1].shader = SHADER_NONE; - } - else { - stack[stack_index].shader = SHADER_NONE; - } -} - -ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space VolumeStack *stack) -{ - /* todo: we should have some way for objects to indicate if they want the - * world shader to work inside them. excluding it by default is problematic - * because non-volume objects can't be assumed to be closed manifolds */ - - if (!(sd->flag & SD_HAS_VOLUME)) - return; - - if (sd->flag & SD_BACKFACING) { - /* exit volume object: remove from stack */ - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - if (stack[i].object == sd->object) { - /* shift back next stack entries */ - do { - stack[i] = stack[i + 1]; - i++; - } while (stack[i].shader != SHADER_NONE); - - return; - } - } - } - else { - /* enter volume object: add to stack */ - int i; - - for (i = 0; stack[i].shader != SHADER_NONE; i++) { - /* already in the stack? 
then we have nothing to do */ - if (stack[i].object == sd->object) - return; - } - - /* if we exceed the stack limit, ignore */ - if (i >= VOLUME_STACK_SIZE - 1) - return; - - /* add to the end of the stack */ - stack[i].shader = sd->shader; - stack[i].object = sd->object; - stack[i + 1].shader = SHADER_NONE; - } -} - -# ifdef __SUBSURFACE__ -ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, - ShaderData *stack_sd, - Ray *ray, - ccl_addr_space VolumeStack *stack) -{ - kernel_assert(kernel_data.integrator.use_volumes); - - Ray volume_ray = *ray; - -# ifdef __VOLUME_RECORD_ALL__ - Intersection hits[2 * VOLUME_STACK_SIZE + 1]; - uint num_hits = scene_intersect_volume_all( - kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY); - if (num_hits > 0) { - Intersection *isect = hits; - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for (uint hit = 0; hit < num_hits; ++hit, ++isect) { - shader_setup_from_ray(kg, stack_sd, isect, &volume_ray); - kernel_volume_stack_enter_exit(kg, stack_sd, stack); - } - } -# else - Intersection isect; - int step = 0; - float3 Pend = ray->P + ray->D * ray->t; - while (step < 2 * VOLUME_STACK_SIZE && - scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) { - shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray); - kernel_volume_stack_enter_exit(kg, stack_sd, stack); - - /* Move ray forward. */ - volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); - if (volume_ray.t != FLT_MAX) { - volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t); - } - ++step; - } -# endif -} -# endif - -/* Clean stack after the last bounce. - * - * It is expected that all volumes are closed manifolds, so at the time when ray - * hits nothing (for example, it is a last bounce which goes to environment) the - * only expected volume in the stack is the world's one. All the rest volume - * entries should have been exited already. 
- * - * This isn't always true because of ray intersection precision issues, which - * could lead us to an infinite non-world volume in the stack, causing render - * artifacts. - * - * Use this function after the last bounce to get rid of all volumes apart from - * the world's one after the last bounce to avoid render artifacts. - */ -ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, - ccl_addr_space VolumeStack *volume_stack) -{ - if (kernel_data.background.volume_shader != SHADER_NONE) { - /* Keep the world's volume in stack. */ - volume_stack[1].shader = SHADER_NONE; - } - else { - volume_stack[0].shader = SHADER_NONE; - } -} - -#endif /* __VOLUME__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index d1602744f1d..fab0915c38e 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_WORK_STEALING_H__ -#define __KERNEL_WORK_STEALING_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN */ /* Map global work index to tile, pixel X/Y and sample. */ -ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, +ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, uint global_work_index, ccl_private uint *x, ccl_private uint *y, ccl_private uint *sample) { -#ifdef __KERNEL_CUDA__ - /* Keeping threads for the same pixel together improves performance on CUDA. */ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#else /* __KERNEL_CUDA__ */ +#if 0 + /* Keep threads for the same sample together. 
*/ uint tile_pixels = tile->w * tile->h; uint sample_offset = global_work_index / tile_pixels; uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#endif /* __KERNEL_CUDA__ */ +#else + /* Keeping threads for the same pixel together. + * Appears to improve performance by a few % on CUDA and OptiX. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#endif + uint y_offset = pixel_offset / tile->w; uint x_offset = pixel_offset - y_offset * tile->w; @@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, *sample = tile->start_sample + sample_offset; } -#ifdef __KERNEL_OPENCL__ -# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#endif - -#ifdef __SPLIT_KERNEL__ -/* Returns true if there is work */ -ccl_device bool get_next_work_item(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) -{ - /* With a small amount of work there may be more threads than work due to - * rounding up of global size, stop such threads immediately. */ - if (ray_index >= total_work_size) { - return false; - } - - /* Increase atomic work index counter in pool. */ - uint pool = ray_index / WORK_POOL_SIZE; - uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]); - - /* Map per-pool work index to a global work index. */ - uint global_size = ccl_global_size(0) * ccl_global_size(1); - kernel_assert(global_size % WORK_POOL_SIZE == 0); - kernel_assert(ray_index < global_size); - - *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) + - (work_index % WORK_POOL_SIZE); - - /* Test if all work for this pool is done. 
*/ - return (*global_work_index < total_work_size); -} - -ccl_device bool get_next_work(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) -{ - bool got_work = false; - if (kernel_data.film.pass_adaptive_aux_buffer) { - do { - got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); - if (got_work) { - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, *global_work_index, &x, &y, &sample); - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - break; - } - } - } while (got_work); - } - else { - got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); - } - return got_work; -} -#endif - CCL_NAMESPACE_END - -#endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h index 410218d91d4..9d379495629 100644 --- a/intern/cycles/kernel/kernel_write_passes.h +++ b/intern/cycles/kernel/kernel_write_passes.h @@ -14,23 +14,25 @@ * limitations under the License. 
*/ -#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__) +#pragma once + +#ifdef __KERNEL_GPU__ # define __ATOMIC_PASS_WRITE__ #endif CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value) +ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value) { - ccl_global float *buf = buffer; #ifdef __ATOMIC_PASS_WRITE__ - atomic_add_and_fetch_float(buf, value); + atomic_add_and_fetch_float(buffer, value); #else - *buf += value; + *buffer += value; #endif } -ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value) +ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer, + float3 value) { #ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; @@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 atomic_add_and_fetch_float(buf_y, value.y); atomic_add_and_fetch_float(buf_z, value.z); #else - ccl_global float3 *buf = (ccl_global float3 *)buffer; - *buf += value; + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; #endif } -ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value) +ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer, + float4 value) { #ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; @@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 atomic_add_and_fetch_float(buf_z, value.z); atomic_add_and_fetch_float(buf_w, value.w); #else - ccl_global float4 *buf = (ccl_global float4 *)buffer; - *buf += value; + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; + buffer[3] += value.w; #endif } -#ifdef __DENOISING_FEATURES__ -ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value) +ccl_device_inline float kernel_read_pass_float(ccl_global float 
*ccl_restrict buffer) { - kernel_write_pass_float(buffer, value); - - /* The online one-pass variance update that's used for the megakernel can't easily be implemented - * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ - kernel_write_pass_float(buffer + 1, value * value); + return *buffer; } -# ifdef __ATOMIC_PASS_WRITE__ -# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 -# else -ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value) +ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer) { - buffer[0] += value.x; - buffer[1] += value.y; - buffer[2] += value.z; + return make_float3(buffer[0], buffer[1], buffer[2]); } -# endif -ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value) +ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer) { - kernel_write_pass_float3_unaligned(buffer, value); - kernel_write_pass_float3_unaligned(buffer + 3, value * value); + return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]); } -#endif /* __DENOISING_FEATURES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp deleted file mode 100644 index 145a6b6ac40..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CPU kernel entry points */ - -/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this - * one with SSE2 intrinsics. - */ -#if defined(__x86_64__) || defined(_M_X64) -# define __KERNEL_SSE2__ -#endif - -/* When building kernel for native machine detect kernel features from the flags - * set by compiler. - */ -#ifdef WITH_KERNEL_NATIVE -# ifdef __SSE2__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# endif -# ifdef __SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifdef __SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifdef __SSE4_1__ -# define __KERNEL_SSE41__ -# endif -# ifdef __AVX__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX__ -# endif -# ifdef __AVX2__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX2__ -# endif -#endif - -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) -/* do nothing */ -#endif - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp deleted file mode 100644 index 012daba62d8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp deleted file mode 100644 index 16351a7f949..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. 
*/ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h deleted file mode 100644 index 1423b182ab8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common declaration part of all CPU kernels. 
*/ - -void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TileInfo *tile_info, - int x, - int y, - float *unfilteredA, - float *unfilteredB, - float *sampleV, - float *sampleVV, - float *bufferV, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset); - -void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TileInfo *tile_info, - int m_offset, - int v_offset, - int x, - int y, - float *mean, - float *variance, - float scale, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset); - -void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, - int x, - int y, - int *buffer_params, - float *from, - float *buffer, - int out_offset, - int *prefilter_rect); - -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, - int y, - ccl_global float *image, - ccl_global float *variance, - ccl_global float *depth, - ccl_global float *output, - int *rect, - int pass_stride); - -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( - int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r); - -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, - TileInfo *tiles, - int x, - int y, - int storage_ofs, - float *transform, - int *rank, - int *rect, - int pass_stride, - int frame_stride, - bool use_time, - int radius, - float pca_threshold); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, - int dy, - float *weight_image, - float *variance_image, - float *scale_image, - float *difference_image, - int *rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( - float *difference_image, float *out_image, int *rect, int stride, int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( - float *difference_image, float *out_image, int *rect, int stride, int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, - int dy, 
- float *difference_image, - float *image, - float *temp_image, - float *out_image, - float *accum_image, - int *rect, - int channel_offset, - int stride, - int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, - int dy, - int t, - float *difference_image, - float *buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int *rect, - int *filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, - float *accum_image, - int *rect, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, - int y, - int storage_ofs, - float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int *buffer_params, - int sample); - -#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h deleted file mode 100644 index 3d4cb87e104..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common implementation part of all CPU kernels. - * - * The idea is that particular .cpp files sets needed optimization flags and - * simply includes this file without worry of copying actual implementation over. 
- */ - -#include "kernel/kernel_compat_cpu.h" - -#include "kernel/filter/filter_kernel.h" - -#ifdef KERNEL_STUB -# define STUB_ASSERT(arch, name) \ - assert(!(#name " kernel stub for architecture " #arch " was called!")) -#endif - -CCL_NAMESPACE_BEGIN - -/* Denoise filter */ - -void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TileInfo *tile_info, - int x, - int y, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); -#else - kernel_filter_divide_shadow(sample, - tile_info, - x, - y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TileInfo *tile_info, - int m_offset, - int v_offset, - int x, - int y, - float *mean, - float *variance, - float scale, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_get_feature); -#else - kernel_filter_get_feature(sample, - tile_info, - m_offset, - v_offset, - x, - y, - mean, - variance, - scale, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, - int x, - int y, - int *buffer_params, - float *from, - float *buffer, - int out_offset, - int *prefilter_rect) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_write_feature); -#else - kernel_filter_write_feature( - sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, - int y, - ccl_global float *image, - ccl_global float *variance, - ccl_global float 
*depth, - ccl_global float *output, - int *rect, - int pass_stride) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); -#else - kernel_filter_detect_outliers( - x, y, image, variance, depth, output, load_int4(rect), pass_stride); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( - int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); -#else - kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, - TileInfo *tile_info, - int x, - int y, - int storage_ofs, - float *transform, - int *rank, - int *prefilter_rect, - int pass_stride, - int frame_stride, - bool use_time, - int radius, - float pca_threshold) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); -#else - rank += storage_ofs; - transform += storage_ofs * TRANSFORM_SIZE; - kernel_filter_construct_transform(buffer, - tile_info, - x, - y, - load_int4(prefilter_rect), - pass_stride, - frame_stride, - use_time, - transform, - rank, - radius, - pca_threshold); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, - int dy, - float *weight_image, - float *variance_image, - float *scale_image, - float *difference_image, - int *rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); -#else - kernel_filter_nlm_calc_difference(dx, - dy, - weight_image, - variance_image, - scale_image, - difference_image, - load_int4(rect), - stride, - channel_offset, - frame_offset, - a, - k_2); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( - float *difference_image, float *out_image, int *rect, int stride, int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); -#else - 
kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( - float *difference_image, float *out_image, int *rect, int stride, int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); -#else - kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, - int dy, - float *difference_image, - float *image, - float *temp_image, - float *out_image, - float *accum_image, - int *rect, - int channel_offset, - int stride, - int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); -#else - kernel_filter_nlm_update_output(dx, - dy, - difference_image, - image, - temp_image, - out_image, - accum_image, - load_int4(rect), - channel_offset, - stride, - f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, - int dy, - int t, - float *difference_image, - float *buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int *rect, - int *filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); -#else - kernel_filter_nlm_construct_gramian(dx, - dy, - t, - difference_image, - buffer, - transform, - rank, - XtWX, - XtWY, - load_int4(rect), - load_int4(filter_window), - stride, - f, - pass_stride, - frame_offset, - use_time); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, - float *accum_image, - int *rect, - int stride) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); -#else - kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, - int y, - int storage_ofs, - float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int 
*buffer_params, - int sample) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_finalize); -#else - XtWX += storage_ofs * XTWX_SIZE; - XtWY += storage_ofs * XTWY_SIZE; - rank += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); -#endif -} - -#undef KERNEL_STUB -#undef STUB_ASSERT -#undef KERNEL_ARCH - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp deleted file mode 100644 index 75833d83648..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp deleted file mode 100644 index c998cd54d3a..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp deleted file mode 100644 index fc4ef1fca5b..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h deleted file mode 100644 index ea3103f12c3..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common declaration part of all CPU kernels. 
*/ - -void KERNEL_FUNCTION_FULL_NAME(path_trace)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - -void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, - uint4 *input, - float4 *output, - int type, - int filter, - int i, - int offset, - int sample); - -void KERNEL_FUNCTION_FULL_NAME(bake)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - -/* Split kernels */ - -void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, - int sy, - int sw, - int sh, - int offset, - int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - -#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data); - -DECLARE_SPLIT_KERNEL_FUNCTION(path_init) -DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) -DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) -DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) -DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) -DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) -DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) 
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) -DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) -DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) -DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) - -#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h deleted file mode 100644 index 51d6c23f72f..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common implementation part of all CPU kernels. - * - * The idea is that particular .cpp files sets needed optimization flags and - * simply includes this file without worry of copying actual implementation over. 
- */ - -// clang-format off -#include "kernel/kernel_compat_cpu.h" - -#ifndef KERNEL_STUB -# ifndef __SPLIT_KERNEL__ -# include "kernel/kernel_math.h" -# include "kernel/kernel_types.h" - -# include "kernel/split/kernel_split_data.h" -# include "kernel/kernel_globals.h" - -# include "kernel/kernel_color.h" -# include "kernel/kernels/cpu/kernel_cpu_image.h" -# include "kernel/kernel_film.h" -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -# include "kernel/kernel_bake.h" -# else -# include "kernel/split/kernel_split_common.h" - -# include "kernel/split/kernel_data_init.h" -# include "kernel/split/kernel_path_init.h" -# include "kernel/split/kernel_scene_intersect.h" -# include "kernel/split/kernel_lamp_emission.h" -# include "kernel/split/kernel_do_volume.h" -# include "kernel/split/kernel_queue_enqueue.h" -# include "kernel/split/kernel_indirect_background.h" -# include "kernel/split/kernel_shader_setup.h" -# include "kernel/split/kernel_shader_sort.h" -# include "kernel/split/kernel_shader_eval.h" -# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -# include "kernel/split/kernel_subsurface_scatter.h" -# include "kernel/split/kernel_direct_lighting.h" -# include "kernel/split/kernel_shadow_blocked_ao.h" -# include "kernel/split/kernel_shadow_blocked_dl.h" -# include "kernel/split/kernel_enqueue_inactive.h" -# include "kernel/split/kernel_next_iteration_setup.h" -# include "kernel/split/kernel_indirect_subsurface.h" -# include "kernel/split/kernel_buffer_update.h" -# include "kernel/split/kernel_adaptive_stopping.h" -# include "kernel/split/kernel_adaptive_filter_x.h" -# include "kernel/split/kernel_adaptive_filter_y.h" -# include "kernel/split/kernel_adaptive_adjust_samples.h" -# endif /* __SPLIT_KERNEL__ */ -#else -# define STUB_ASSERT(arch, name) \ - assert(!(#name " kernel stub for architecture " #arch " was called!")) - -# ifdef __SPLIT_KERNEL__ -# include "kernel/split/kernel_data_init.h" -# endif /* 
__SPLIT_KERNEL__ */ -#endif /* KERNEL_STUB */ -// clang-format on - -CCL_NAMESPACE_BEGIN - -#ifndef __SPLIT_KERNEL__ - -/* Path Tracing */ - -void KERNEL_FUNCTION_FULL_NAME(path_trace)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, path_trace); -# else -# ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) { - kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride); - } - else -# endif - { - kernel_path_trace(kg, buffer, sample, x, y, offset, stride); - } -# endif /* KERNEL_STUB */ -} - -/* Film */ - -void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_byte); -# else - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); -# endif /* KERNEL_STUB */ -} - -void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); -# else - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); -# endif /* KERNEL_STUB */ -} - -/* Bake */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, bake); -# else -# ifdef __BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -# endif -# endif /* KERNEL_STUB */ -} - -/* Shader Evaluate */ - -void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, - uint4 *input, - float4 *output, - int type, - int filter, - int i, - int offset, - int sample) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, shader); -# else - if (type == SHADER_EVAL_DISPLACE) { - 
kernel_displace_evaluate(kg, input, output, i); - } - else { - kernel_background_evaluate(kg, input, output, i); - } -# endif /* KERNEL_STUB */ -} - -#else /* __SPLIT_KERNEL__ */ - -/* Split Kernel Path Tracing */ - -# ifdef KERNEL_STUB -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } -# else -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - kernel_##name(kg); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - ccl_local type locals; \ - kernel_##name(kg, &locals); \ - } -# endif /* KERNEL_STUB */ - -DEFINE_SPLIT_KERNEL_FUNCTION(path_init) -DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, - BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) 
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) -#endif /* __SPLIT_KERNEL__ */ - -#undef KERNEL_STUB -#undef STUB_ASSERT -#undef KERNEL_ARCH - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp deleted file mode 100644 index 989f5e5aaa8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CPU kernel entry points */ - -/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this - * one with SSE2 intrinsics. - */ -#if defined(__x86_64__) || defined(_M_X64) -# define __KERNEL_SSE2__ -#endif - -#define __SPLIT_KERNEL__ - -/* When building kernel for native machine detect kernel features from the flags - * set by compiler. 
- */ -#ifdef WITH_KERNEL_NATIVE -# ifdef __SSE2__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# endif -# ifdef __SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifdef __SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifdef __SSE4_1__ -# define __KERNEL_SSE41__ -# endif -# ifdef __AVX__ -# define __KERNEL_AVX__ -# endif -# ifdef __AVX2__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX2__ -# endif -#endif - -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) -/* do nothing */ -#endif - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp deleted file mode 100644 index 40e485d27c0..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp deleted file mode 100644 index 8c44238470e..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp deleted file mode 100644 index 7a3f218d5fc..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp deleted file mode 100644 index 1cab59e0ea0..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp deleted file mode 100644 index 637126d9d4c..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu deleted file mode 100644 index 6c9642d1f03..00000000000 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel_config.h" - -#include "kernel/kernel_compat_cuda.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_copy_input(float *buffer, - CCL_FILTER_TILE_INFO, - int4 prefilter_rect, - int buffer_pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 
1 : 2); - int itile = ytile * 3 + xtile; - float *const in = ((float *)ccl_get_tile_buffer(itile)) + - (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride; - buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride; - for (int i = 0; i < buffer_pass_stride; ++i) - buffer[i] = in[i]; - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - if (num_inputs > 0) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float); - float *out = rgb + (x + y * sw) * 3; - out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f); - out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f); - out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f); - } - if (num_inputs > 1) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - if (num_inputs > 2) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + 
threadIdx.y; - if(x < sw && y < sh) { - float *in = rgb + ((ix + x) + (iy + y) * iw) * 3; - float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride; - out[0] = in[0] * num_samples; - out[1] = in[1] * num_samples; - out[2] = in[2] * num_samples; - } -} - - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - tile_info, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - float *mean, - float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - tile_info, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_write_feature(int sample, - int4 buffer_params, - int4 filter_area, - float *from, - float *buffer, - int out_offset, - int4 
prefilter_rect) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_detect_outliers(float *image, - float *variance, - float *depth, - float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, - CCL_FILTER_TILE_INFO, - float *transform, int *rank, - int4 filter_area, int4 rect, - int radius, float pca_threshold, - int pass_stride, int frame_stride, - bool use_time) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int *l_rank = rank + y*filter_area.z + x; - float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - tile_info, 
- x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - filter_area.z*filter_area.w, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *out_image, - float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int t, - const float *ccl_restrict difference_image, - const float *ccl_restrict buffer, - float const* __restrict__ transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - bool use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_area, - int4 buffer_params, - 
int sample) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu deleted file mode 100644 index cf62b6e781e..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "util/util_atomic.h" - -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/cuda/kernel_cuda_image.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_adaptive_sampling.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} - -#ifdef __BRANCHED_PATH__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} -#endif - -extern "C" 
__global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(&kg, buffer, sample); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->h) { - int y = tile->y + ccl_global_id(0); - kernel_do_adaptive_filter_x(&kg, y, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->w) { - int x = tile->x + ccl_global_id(0); - kernel_do_adaptive_filter_y(&kg, x, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) -{ - if(kernel_data.film.pass_adaptive_aux_buffer) { - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + 
x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count]; - if(sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); - } - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_displace(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_displace_evaluate(&kg, input, output, x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_background(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_background_evaluate(&kg, input, output, x); - } -} - -#ifdef __BAKING__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_bake(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - - if(work_index < total_work_size) { - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - KernelGlobals kg; - kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } -} -#endif - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h deleted file mode 100644 index 2e47ce2de6c..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* device data taken from CUDA occupancy calculator */ - -/* 3.0 and 3.5 */ -#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.x, 6.x */ -#elif __CUDA_ARCH__ <= 699 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of - * registers */ -# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# else -# define CUDA_KERNEL_MAX_REGISTERS 48 -# endif -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 7.x, 8.x */ -#elif __CUDA_ARCH__ <= 899 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* 
tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* For split kernel using all registers seems fastest for now, but this - * is unlikely to be optimal once we resolve other bottlenecks. */ - -#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS - -/* Compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread. */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__(threads_block_width *threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (threads_block_width * threads_block_width * thread_num_registers)) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \ - CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu deleted file mode 100644 index 95ad7599cf1..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA split kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#define __SPLIT_KERNEL__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_data_init.h" -#include "kernel/split/kernel_path_init.h" -#include "kernel/split/kernel_scene_intersect.h" -#include "kernel/split/kernel_lamp_emission.h" -#include "kernel/split/kernel_do_volume.h" -#include "kernel/split/kernel_queue_enqueue.h" -#include "kernel/split/kernel_indirect_background.h" -#include "kernel/split/kernel_shader_setup.h" -#include "kernel/split/kernel_shader_sort.h" -#include "kernel/split/kernel_shader_eval.h" -#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -#include "kernel/split/kernel_subsurface_scatter.h" -#include "kernel/split/kernel_direct_lighting.h" -#include "kernel/split/kernel_shadow_blocked_ao.h" -#include "kernel/split/kernel_shadow_blocked_dl.h" -#include "kernel/split/kernel_enqueue_inactive.h" -#include "kernel/split/kernel_next_iteration_setup.h" -#include "kernel/split/kernel_indirect_subsurface.h" -#include "kernel/split/kernel_buffer_update.h" -#include "kernel/split/kernel_adaptive_stopping.h" -#include "kernel/split/kernel_adaptive_filter_x.h" -#include "kernel/split/kernel_adaptive_filter_y.h" -#include "kernel/split/kernel_adaptive_adjust_samples.h" - -#include "kernel/kernel_film.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) -{ - *size = split_data_buffer_size(NULL, num_threads); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace_data_init( - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer) -{ - kernel_data_init(NULL, - NULL, - split_data_buffer, - num_elements, - ray_state, - start_sample, - end_sample, - sx, sy, sw, sh, offset, stride, - Queue_index, - queuesize, - use_queues_flag, - work_pool_wgs, - num_samples, - buffer); -} - -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - kernel_##name(NULL); \ - } - -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - ccl_local type locals; \ - kernel_##name(NULL, &locals); \ - } - -DEFINE_SPLIT_KERNEL_FUNCTION(path_init) -DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) 
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -#endif - diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl deleted file mode 100644 index 996bc27f71b..00000000000 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* OpenCL kernel entry points */ - -#include "kernel/kernel_compat_opencl.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -__kernel void kernel_ocl_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - ccl_global float *unfilteredA, - ccl_global float *unfilteredB, - ccl_global float *sampleVariance, - ccl_global float *sampleVarianceV, - ccl_global float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - CCL_FILTER_TILE_INFO_ARG, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -__kernel void kernel_ocl_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - ccl_global float *mean, - ccl_global float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - CCL_FILTER_TILE_INFO_ARG, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -__kernel void kernel_ocl_filter_write_feature(int sample, - int4 buffer_params, - int4 
filter_area, - ccl_global float *from, - ccl_global float *buffer, - int out_offset, - int4 prefilter_rect) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, - ccl_global float *variance, - ccl_global float *depth, - ccl_global float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, - ccl_global float *variance, - ccl_global float *a, - ccl_global float *b, - int4 prefilter_rect, - int r) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - ccl_global float *transform, - ccl_global int *rank, - int4 filter_area, - int4 rect, - int pass_stride, - int frame_stride, - char use_time, - int radius, - float pca_threshold) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - ccl_global int *l_rank = rank + y*filter_area.z + x; - ccl_global float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - CCL_FILTER_TILE_INFO_ARG, - x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - 
filter_area.z*filter_area.w, - get_local_id(1)*get_local_size(0) + get_local_id(0)); - } -} - -__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, - ccl_global float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, - ccl_global float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict image, - ccl_global float *out_image, - ccl_global float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - 
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, - const ccl_global float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -__kernel void kernel_ocl_filter_nlm_construct_gramian(int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - char use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - get_local_id(1)*get_local_size(0) + get_local_id(0)); - } -} - -__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 filter_area, - int4 buffer_params, - int sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl 
deleted file mode 100644 index ebdb99d4730..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_adjust_samples.h" - -#define KERNEL_NAME adaptive_adjust_samples -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl deleted file mode 100644 index 76d82d4184e..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_filter_x.h" - -#define KERNEL_NAME adaptive_filter_x -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl deleted file mode 100644 index 1e6d15ba0f2..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_filter_y.h" - -#define KERNEL_NAME adaptive_filter_y -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl deleted file mode 100644 index 51de0059667..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_stopping.h" - -#define KERNEL_NAME adaptive_stopping -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl deleted file mode 100644 index 0e600676e82..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl +++ /dev/null @@ -1,35 +0,0 @@ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_background( - ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, - - KERNEL_BUFFER_PARAMS, - - int type, int sx, int sw, int offset, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - - if(x < sx + sw) { - kernel_background_evaluate(kg, input, output, x); - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl deleted file mode 100644 index 7b81e387467..00000000000 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl +++ /dev/null @@ -1,36 +0,0 @@ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_bake( - ccl_constant KernelData *data, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - int sx, int sy, int sw, int sh, int offset, int stride, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) { -#ifndef __NO_BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -#endif - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl deleted file mode 100644 index 1c2d89e8a92..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* OpenCL base kernels entry points */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" - -#include "kernel/kernel_film.h" - - -__kernel void kernel_ocl_convert_to_byte( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_convert_to_half_float( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) -{ - size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); - - if(i < size / sizeof(float4)) { - buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - else if(i == size / sizeof(float4)) { - ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; - - for(i = 0; i < size % sizeof(float4); i++) { - *(b++) = 0; - } - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl deleted file 
mode 100644 index 7125348a49f..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_data_init.h" - -__kernel void kernel_ocl_path_trace_data_init( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - KERNEL_BUFFER_PARAMS, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ - ccl_global float *buffer) -{ - kernel_data_init((KernelGlobals*)kg, - data, - split_data_buffer, - num_elements, - ray_state, - KERNEL_BUFFER_ARGS, - start_sample, - end_sample, - sx, sy, sw, sh, offset, stride, - Queue_index, - queuesize, - use_queues_flag, - work_pool_wgs, - num_samples, - buffer); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl deleted file mode 100644 index 76cc36971f5..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl +++ /dev/null @@ -1,36 +0,0 @@ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_displace( - ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, - - KERNEL_BUFFER_PARAMS, - - int type, int sx, int sw, int offset, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - - if(x < sx + sw) { - kernel_displace_evaluate(kg, input, output, x); - } -} - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl deleted file mode 100644 index 8b1332bf013..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_next_iteration_setup.h" - -#define KERNEL_NAME next_iteration_setup -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h deleted file mode 100644 index bb6b8a40e8e..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_NANOVDB -/* Data type to replace `double` used in the NanoVDB headers. Cycles don't need doubles, and is - * safer and more portable to never use double datatype on GPU. - * Use a special structure, so that the following is true: - * - No unnoticed implicit cast or mathematical operations used on scalar 64bit type - * (which rules out trick like using `uint64_t` as a drop-in replacement for double). - * - Padding rules are matching exactly `double` - * (which rules out array of `uint8_t`). */ -typedef struct ccl_vdb_double_t { - uint64_t i; -} ccl_vdb_double_t; - -# define double ccl_vdb_double_t -# include "nanovdb/CNanoVDB.h" -# undef double -#endif - -/* For OpenCL we do manual lookup and interpolation. 
*/ - -ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id) -{ - const uint tex_offset = id -#define KERNEL_TEX(type, name) +1 -#include "kernel/kernel_textures.h" - ; - - return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset]; -} - -#define tex_fetch(type, info, index) \ - ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)] - -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) -{ - x %= width; - if (x < 0) - x += width; - return x; -} - -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) -{ - return clamp(x, 0, width - 1); -} - -ccl_device_inline float4 svm_image_texture_read( - KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z) -{ - const int data_offset = x + info->width * y + info->width * info->height * z; - const int texture_type = info->data_type; - - /* Float4 */ - if (texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return tex_fetch(float4, info, data_offset); - } - /* Byte4 */ - else if (texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = tex_fetch(uchar4, info, data_offset); - float f = 1.0f / 255.0f; - return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); - } - /* Ushort4 */ - else if (texture_type == IMAGE_DATA_TYPE_USHORT4) { - ushort4 r = tex_fetch(ushort4, info, data_offset); - float f = 1.0f / 65535.f; - return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); - } - /* Float */ - else if (texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = tex_fetch(float, info, data_offset); - return make_float4(f, f, f, 1.0f); - } - /* UShort */ - else if (texture_type == IMAGE_DATA_TYPE_USHORT) { - ushort r = tex_fetch(ushort, info, data_offset); - float f = r * (1.0f / 65535.0f); - return make_float4(f, f, f, 1.0f); - } -#ifdef WITH_NANOVDB - /* NanoVDB Float */ - else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { - cnanovdb_coord coord; - coord.mVec[0] = x; - coord.mVec[1] = y; - coord.mVec[2] = z; - float f = 
cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord); - return make_float4(f, f, f, 1.0f); - } - /* NanoVDB Float3 */ - else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - cnanovdb_coord coord; - coord.mVec[0] = x; - coord.mVec[1] = y; - coord.mVec[2] = z; - cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord); - return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f); - } -#endif -#ifdef __KERNEL_CL_KHR_FP16__ - /* Half and Half4 are optional in OpenCL */ - else if (texture_type == IMAGE_DATA_TYPE_HALF) { - float f = tex_fetch(half, info, data_offset); - return make_float4(f, f, f, 1.0f); - } - else if (texture_type == IMAGE_DATA_TYPE_HALF4) { - half4 r = tex_fetch(half4, info, data_offset); - return make_float4(r.x, r.y, r.z, r.w); - } -#endif - /* Byte */ - else { - uchar r = tex_fetch(uchar, info, data_offset); - float f = r * (1.0f / 255.0f); - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device_inline float4 -svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - -#ifdef WITH_NANOVDB - if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { -#endif - /* Wrap */ - if (info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - } -#ifdef WITH_NANOVDB - } -#endif - - return svm_image_texture_read(kg, info, acc, x, y, 0); -} - -ccl_device_inline float4 -svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - -#ifdef WITH_NANOVDB - if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { -#endif - 
/* Wrap */ - if (info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - z = svm_image_texture_wrap_periodic(z, info->depth); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - z = svm_image_texture_wrap_clamp(z, info->depth); - } -#ifdef WITH_NANOVDB - } -#endif - - return svm_image_texture_read(kg, info, acc, x, y, z); -} - -ccl_device_inline float svm_image_texture_frac(float x, int *ix) -{ - int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); - *ix = i; - return x - (float)i; -} - -#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ - u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ - u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } \ - (void)0 - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if (info->extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - if (info->interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy; - svm_image_texture_frac(x * info->width, &ix); - svm_image_texture_frac(y * info->height, &iy); - - return svm_image_texture_read_2d(kg, id, NULL, ix, iy); - } - else if (info->interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. 
*/ - int ix, iy; - float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); - - float4 r; - r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy); - r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy); - r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1); - r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1); - return r; - } - else { - /* Bicubic interpolation. */ - int ix, iy; - float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); - - float u[4], v[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for (int y = 0; y < 4; y++) { - for (int x = 0; x < 4; x++) { - float weight = u[x] * v[y]; - r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1); - } - } - return r; - } -} - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if (info->use_transform_3d) { - Transform tfm = info->transform_3d; - P = transform_point(&tfm, P); - } - - float x = P.x; - float y = P.y; - float z = P.z; - - uint interpolation = (interp == INTERPOLATION_NONE) ? 
info->interpolation : interp; - -#ifdef WITH_NANOVDB - cnanovdb_readaccessor acc; - if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || - info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - ccl_global cnanovdb_griddata *grid = - (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data); - cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid))); - } - else { - if (info->extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - x *= info->width; - y *= info->height; - z *= info->depth; - } -# define NANOVDB_ACCESS_POINTER &acc -#else -# define NANOVDB_ACCESS_POINTER NULL -#endif - - if (interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy, iz; - svm_image_texture_frac(x, &ix); - svm_image_texture_frac(y, &iy); - svm_image_texture_frac(z, &iz); - - return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz); - } - else if (interpolation == INTERPOLATION_LINEAR) { - /* Trilinear interpolation. 
*/ - int ix, iy, iz; - float tx = svm_image_texture_frac(x - 0.5f, &ix); - float ty = svm_image_texture_frac(y - 0.5f, &iy); - float tz = svm_image_texture_frac(z - 0.5f, &iz); - - float4 r; - r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz); - r += (1.0f - tz) * (1.0f - ty) * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz); - r += (1.0f - tz) * ty * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz); - r += (1.0f - tz) * ty * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz); - - r += tz * (1.0f - ty) * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1); - r += tz * (1.0f - ty) * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1); - r += tz * ty * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1); - r += tz * ty * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1); - return r; - } - else { - /* Tricubic interpolation. 
*/ - int ix, iy, iz; - float tx = svm_image_texture_frac(x - 0.5f, &ix); - float ty = svm_image_texture_frac(y - 0.5f, &iy); - float tz = svm_image_texture_frac(z - 0.5f, &iz); - - float u[4], v[4], w[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for (int z = 0; z < 4; z++) { - for (int y = 0; y < 4; y++) { - for (int x = 0; x < 4; x++) { - float weight = u[x] * v[y] * w[z]; - r += weight * svm_image_texture_read_3d( - kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1); - } - } - } - return r; - } -#undef NANOVDB_ACCESS_POINTER -} - -#undef SET_CUBIC_SPLINE_WEIGHTS diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl deleted file mode 100644 index 68ee6f1d536..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_queue_enqueue.h" - -#define KERNEL_NAME queue_enqueue -#define LOCALS_TYPE QueueEnqueueLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl deleted file mode 100644 index 10d09377ba9..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_scene_intersect.h" - -#define KERNEL_NAME scene_intersect -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl deleted file mode 100644 index 40eaa561863..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_eval.h" - -#define KERNEL_NAME shader_eval -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl deleted file mode 100644 index 8c36100f762..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_setup.h" - -#define KERNEL_NAME shader_setup -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl deleted file mode 100644 index bcacaa4a054..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_sort.h" - -__attribute__((reqd_work_group_size(64, 1, 1))) -#define KERNEL_NAME shader_sort -#define LOCALS_TYPE ShaderSortLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl deleted file mode 100644 index 8de250a375c..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shadow_blocked_ao.h" - -#define KERNEL_NAME shadow_blocked_ao -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl deleted file mode 100644 index 29da77022ed..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shadow_blocked_dl.h" - -#define KERNEL_NAME shadow_blocked_dl -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl deleted file mode 100644 index c3b7b09460a..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" // PRECOMPILED -#include "kernel/split/kernel_split_common.h" // PRECOMPILED - -#include "kernel/kernels/opencl/kernel_data_init.cl" -#include "kernel/kernels/opencl/kernel_path_init.cl" -#include "kernel/kernels/opencl/kernel_state_buffer_size.cl" -#include "kernel/kernels/opencl/kernel_scene_intersect.cl" -#include "kernel/kernels/opencl/kernel_queue_enqueue.cl" -#include "kernel/kernels/opencl/kernel_shader_setup.cl" -#include "kernel/kernels/opencl/kernel_shader_sort.cl" -#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" -#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" -#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" -#include "kernel/kernels/opencl/kernel_buffer_update.cl" -#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl" -#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl" -#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl" -#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h deleted file mode 100644 index e123b4cd6ec..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define KERNEL_NAME_JOIN(a, b) a##_##b -#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) - -__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, - KERNEL_NAME)(ccl_global char *kg_global, - ccl_constant KernelData *data, - - ccl_global void *split_data_buffer, - ccl_global char *ray_state, - - KERNEL_BUFFER_PARAMS, - - ccl_global int *queue_index, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pools, - ccl_global float *buffer) -{ -#ifdef LOCALS_TYPE - ccl_local LOCALS_TYPE locals; -#endif - - KernelGlobals *kg = (KernelGlobals *)kg_global; - - if (ccl_local_id(0) + ccl_local_id(1) == 0) { - kg->data = data; - - kernel_split_params.queue_index = queue_index; - kernel_split_params.use_queues_flag = use_queues_flag; - kernel_split_params.work_pools = work_pools; - kernel_split_params.tile.buffer = buffer; - - split_data_init(kg, - &kernel_split_state, - ccl_global_size(0) * ccl_global_size(1), - split_data_buffer, - ray_state); - } - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - - KERNEL_NAME_EVAL(kernel, KERNEL_NAME) - (kg -#ifdef LOCALS_TYPE - , - &locals -#endif - ); -} - -#undef KERNEL_NAME_JOIN -#undef KERNEL_NAME_EVAL diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl deleted file mode 100644 index 2b3be38df84..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_subsurface_scatter.h" - -#define KERNEL_NAME subsurface_scatter -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp index 3f9de5ab33d..8e497986dcc 100644 --- a/intern/cycles/kernel/osl/background.cpp +++ b/intern/cycles/kernel/osl/background.cpp @@ -37,7 +37,7 @@ #include "kernel/osl/osl_closures.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/closure/alloc.h" #include "kernel/closure/emissive.h" // clang-format on diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index 76a2e41abfa..a2f9d3f759a 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,7 +34,7 @@ #include <OSL/genclosure.h> -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index b78dc8a3a67..812c3b6e71b 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,7 +34,7 @@ #include <OSL/genclosure.h> 
-#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp index d656723bac2..80dfbee879e 100644 --- a/intern/cycles/kernel/osl/emissive.cpp +++ b/intern/cycles/kernel/osl/emissive.cpp @@ -37,7 +37,7 @@ #include "kernel/osl/osl_closures.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/kernel_types.h" #include "kernel/closure/alloc.h" #include "kernel/closure/emissive.h" diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index c5ca8616fbd..5d968ed85e0 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -32,7 +32,7 @@ #include <OSL/genclosure.h> -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off @@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN using namespace OSL; -static ustring u_cubic("cubic"); -static ustring u_gaussian("gaussian"); -static ustring u_burley("burley"); -static ustring u_principled("principled"); +static ustring u_random_walk_fixed_radius("random_walk_fixed_radius"); static ustring u_random_walk("random_walk"); -static ustring u_principled_random_walk("principled_random_walk"); class CBSSRDFClosure : public CClosurePrimitive { public: Bssrdf params; + float ior; ustring method; CBSSRDFClosure() { - params.texture_blur = 0.0f; - params.sharpness = 0.0f; - params.roughness = 0.0f; + params.roughness = FLT_MAX; + params.anisotropy = 1.0f; + ior = 1.4f; } void setup(ShaderData *sd, int path_flag, float3 weight) { - if (method == u_cubic) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID); - } - else if (method == u_gaussian) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID); - } - else if (method == u_burley) { - alloc(sd, path_flag, 
weight, CLOSURE_BSSRDF_BURLEY_ID); - } - else if (method == u_principled) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + if (method == u_random_walk_fixed_radius) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); } else if (method == u_random_walk) { alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID); } - else if (method == u_principled_random_walk) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); - } } void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type) @@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive { /* create one closure per color channel */ bssrdf->radius = params.radius; bssrdf->albedo = params.albedo; - bssrdf->texture_blur = params.texture_blur; - bssrdf->sharpness = params.sharpness; bssrdf->N = params.N; bssrdf->roughness = params.roughness; - sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); + bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f); + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f)); } } }; @@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params() CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N), CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius), CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo), - CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"), - CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"), CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"), CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"), CLOSURE_FINISH_PARAM(CBSSRDFClosure)}; return params; diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 7ee467a46dd..e814fcca246 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ 
b/intern/cycles/kernel/osl/osl_closures.cpp @@ -40,10 +40,10 @@ #include "util/util_param.h" // clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_types.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" #include "kernel/kernel_montecarlo.h" #include "kernel/kernel_random.h" @@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) { /* caustic options */ if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) { - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) || (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) { diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 2b7c21d0bc4..396f42080e4 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -40,22 +40,22 @@ #include "util/util_string.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_write_passes.h" -#include "kernel/kernel_projection.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/image.h" + #include "kernel/kernel_differential.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_camera.h" -#include "kernel/kernels/cpu/kernel_cpu_image.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" + #include "kernel/geom/geom.h" #include "kernel/bvh/bvh.h" +#include "kernel/kernel_color.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" 
#include "kernel/kernel_projection.h" -#include "kernel/kernel_accumulate.h" #include "kernel/kernel_shader.h" // clang-format on @@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. */ if (xform) { const ShaderData *sd = (const ShaderData *)xform; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { @@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, Transform tfm; if (time == sd->time) - tfm = sd->ob_tfm; + tfm = object_get_transform(kg, sd); else tfm = object_fetch_transform_motion_test(kg, object, time, NULL); #else - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + const Transform tfm = object_get_transform(kg, sd); #endif copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_tfm); + const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); + copy_matrix(result, tfm); return true; } @@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. 
*/ if (xform) { const ShaderData *sd = (const ShaderData *)xform; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { @@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, Transform itfm; if (time == sd->time) - itfm = sd->ob_itfm; + itfm = object_get_inverse_transform(kg, sd); else object_fetch_transform_motion_test(kg, object, time, &itfm); #else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + const Transform itfm = object_get_inverse_transform(kg, sd); #endif copy_matrix(result, itfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_itfm); + const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true); + copy_matrix(result, itfm); return true; } @@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, float time) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (from == u_ndc) { copy_matrix(result, kernel_data.cam.ndctoworld); @@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, float time) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (to == u_ndc) { copy_matrix(result, kernel_data.cam.worldtondc); @@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. 
*/ if (xform) { const ShaderData *sd = (const ShaderData *)xform; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - KernelGlobals *kg = sd->osl_globals; - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); -#endif + const Transform tfm = object_get_transform(kg, sd); copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_tfm); + const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); + copy_matrix(result, tfm); return true; } @@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. */ if (xform) { const ShaderData *sd = (const ShaderData *)xform; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - KernelGlobals *kg = sd->osl_globals; - Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); -#endif + const Transform tfm = object_get_inverse_transform(kg, sd); copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_itfm); + const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true); + copy_matrix(result, itfm); return true; } @@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (from == u_ndc) { copy_matrix(result, kernel_data.cam.ndctoworld); @@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, ustring to) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = 
sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (to == u_ndc) { copy_matrix(result, kernel_data.cam.worldtondc); @@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val) return false; } -static bool get_primitive_attribute(KernelGlobals *kg, +static bool get_primitive_attribute(const KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute &attr, const TypeDesc &type, @@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg, } } -static bool get_mesh_attribute(KernelGlobals *kg, +static bool get_mesh_attribute(const KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute &attr, const TypeDesc &type, @@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr, } } -bool OSLRenderServices::get_object_standard_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val) +bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val) { /* todo: turn this into hash table? 
*/ @@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute( return false; } -bool OSLRenderServices::get_background_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val) +bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val) { if (name == u_path_ray_length) { /* Ray Length */ @@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute( } else if (name == u_path_ray_depth) { /* Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_diffuse_depth) { /* Diffuse Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->diffuse_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.diffuse_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_glossy_depth) { /* Glossy Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->glossy_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.glossy_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transmission_depth) { /* Transmission Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->transmission_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.transmission_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transparent_depth) { /* Transparent Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->transparent_bounce; - return set_attribute_int(f, type, derivatives, val); - } - else if (name == u_path_transmission_depth) { - /* Transmission Ray Depth */ - PathState *state = sd->osl_path_state; - 
int f = state->transmission_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.transparent_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_ndc) { @@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute( ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P); if (derivatives) { - ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0]; - ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0]; + ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) - + ndc[0]; + ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) - + ndc[0]; } } else { @@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool OSLRenderServices::get_attribute( ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int prim_type = 0; int object; @@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename, OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle; OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO; ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; bool status = false; switch (texture_type) { case OSLTextureHandle::BEVEL: { /* Bevel shader hack. 
*/ if (nchannels >= 3) { - PathState *state = sd->osl_path_state; + const IntegratorStateCPU *state = sd->osl_path_state; int num_samples = (int)s; float radius = t; - float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples); + float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples); result[0] = N.x; result[1] = N.y; result[2] = N.z; @@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename, } case OSLTextureHandle::AO: { /* AO shader hack. */ - PathState *state = sd->osl_path_state; + const IntegratorStateCPU *state = sd->osl_path_state; int num_samples = (int)s; float radius = t; float3 N = make_float3(dsdx, dtdx, dsdy); @@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename, if ((int)options.tblur) { flags |= NODE_AO_GLOBAL_RADIUS; } - result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags); + result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags); status = true; break; } @@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename, case OSLTextureHandle::SVM: { /* Packed texture. 
*/ ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; int slot = handle->svm_slot; float3 P_float3 = make_float3(P.x, P.y, P.z); float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE); @@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename, if (handle && handle->oiio_handle) { if (texture_thread_info == NULL) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; OSLThreadData *tdata = kernel_globals->osl_tdata; texture_thread_info = tdata->oiio_thread_info; } @@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename, if (handle && handle->oiio_handle) { if (thread_info == NULL) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; OSLThreadData *tdata = kernel_globals->osl_tdata; thread_info = tdata->oiio_thread_info; } @@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options, } /* ray differentials */ - ray.dP.dx = TO_FLOAT3(dPdx); - ray.dP.dy = TO_FLOAT3(dPdy); - ray.dD.dx = TO_FLOAT3(dRdx); - ray.dD.dy = TO_FLOAT3(dRdy); + differential3 dP; + dP.dx = TO_FLOAT3(dPdx); + dP.dy = TO_FLOAT3(dPdy); + ray.dP = differential_make_compact(dP); + differential3 dD; + dD.dx = TO_FLOAT3(dRdx); + dD.dy = TO_FLOAT3(dRdy); + ray.dD = differential_make_compact(dD); /* allocate trace data */ OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata; @@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options, tracedata->hit = false; tracedata->sd.osl_globals = sd->osl_globals; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; /* Can't raytrace from shaders like displacement, before BVH exists. 
*/ if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) { @@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg, } else { ShaderData *sd = &tracedata->sd; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (!tracedata->setup) { /* lazy shader data setup */ - shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray); + shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect); tracedata->setup = true; } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 891b9172dd4..58accb46e7d 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices { void *data) override; #endif - static bool get_background_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val); - static bool get_object_standard_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val); + static bool get_background_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val); + static bool get_object_standard_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val); static ustring u_distance; static ustring u_index; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 389c854c495..880ef635c76 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -17,14 +17,16 @@ #include <OSL/oslexec.h> // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_montecarlo.h" #include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" -#include 
"kernel/kernel_globals.h" #include "kernel/geom/geom_object.h" +#include "kernel/integrator/integrator_state.h" + #include "kernel/osl/osl_closures.h" #include "kernel/osl/osl_globals.h" #include "kernel/osl/osl_services.h" @@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN /* Threads */ -void OSLShader::thread_init(KernelGlobals *kg, - KernelGlobals *kernel_globals, - OSLGlobals *osl_globals) +void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals) { /* no osl used? */ if (!osl_globals->use) { @@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg) /* Globals */ -static void shaderdata_to_shaderglobals( - KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata) +static void shaderdata_to_shaderglobals(const KernelGlobals *kg, + ShaderData *sd, + const IntegratorStateCPU *state, + int path_flag, + OSLThreadData *tdata) { OSL::ShaderGlobals *globals = &tdata->globals; @@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_surface(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_background(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_volume(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + 
int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state) +void OSLShader::eval_displacement(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState * /* Attributes */ -int OSLShader::find_attribute(KernelGlobals *kg, +int OSLShader::find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc) diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index a4fa24d0a90..f1f17b141eb 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -37,6 +37,7 @@ class Scene; struct ShaderClosure; struct ShaderData; +struct IntegratorStateCPU; struct differential3; struct KernelGlobals; @@ -49,19 +50,28 @@ class OSLShader { static void register_closures(OSLShadingSystem *ss); /* per thread data */ - static void thread_init(KernelGlobals *kg, - KernelGlobals *kernel_globals, - OSLGlobals *osl_globals); + static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals); static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state); + static void eval_surface(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static 
void eval_background(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static void eval_volume(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static void eval_displacement(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd); /* attributes */ - static int find_attribute(KernelGlobals *kg, + static int find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl index 23949f406c7..55afb892d36 100644 --- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -18,11 +18,13 @@ #include "stdcycles.h" shader node_principled_bsdf(string distribution = "Multiscatter GGX", - string subsurface_method = "burley", + string subsurface_method = "random_walk", color BaseColor = color(0.8, 0.8, 0.8), float Subsurface = 0.0, vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), color SubsurfaceColor = color(0.7, 0.1, 0.1), + float SubsurfaceIOR = 1.4, + float SubsurfaceAnisotropy = 0.0, float Metallic = 0.0, float Specular = 0.5, float SpecularTint = 0.0, @@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX", if (diffuse_weight > 1e-5) { if (Subsurface > 1e-5) { color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); - if (subsurface_method == "burley") { - BSDF = mixed_ss_base_color * bssrdf("principled", - Normal, - Subsurface * SubsurfaceRadius, - SubsurfaceColor, - "roughness", - Roughness); - } - else { - BSDF = mixed_ss_base_color * bssrdf("principled_random_walk", - Normal, - Subsurface * SubsurfaceRadius, - mixed_ss_base_color, - "roughness", - Roughness); - } + + BSDF = mixed_ss_base_color * bssrdf(subsurface_method, + Normal, + Subsurface * SubsurfaceRadius, + 
mixed_ss_base_color, + "roughness", + Roughness, + "ior", + SubsurfaceIOR, + "anisotropy", + SubsurfaceAnisotropy); } else { BSDF = BaseColor * principled_diffuse(Normal, Roughness); diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl index b1e854150ab..f55e38c54ff 100644 --- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl +++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl @@ -19,27 +19,12 @@ shader node_subsurface_scattering(color Color = 0.8, float Scale = 1.0, vector Radius = vector(0.1, 0.1, 0.1), - float TextureBlur = 0.0, - float Sharpness = 0.0, - string falloff = "cubic", + float IOR = 1.4, + float Anisotropy = 0.0, + string method = "random_walk", normal Normal = N, output closure color BSSRDF = 0) { - if (falloff == "gaussian") - BSSRDF = Color * - bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); - else if (falloff == "cubic") - BSSRDF = Color * bssrdf("cubic", - Normal, - Scale * Radius, - Color, - "texture_blur", - TextureBlur, - "sharpness", - Sharpness); - else if (falloff == "burley") - BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); - else - BSSRDF = Color * - bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); + BSSRDF = Color * + bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy); } diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h deleted file mode 100644 index 437a5c9581b..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) { - int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; - int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; - int buffer_offset = (kernel_split_params.tile.offset + x + - y * kernel_split_params.tile.stride) * - kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f)); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h deleted file mode 100644 index 93f41f7ced4..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use 
this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.h && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int y = kernel_split_params.tile.y + pixel_index; - kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h deleted file mode 100644 index eca53d079ec..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int x = kernel_split_params.tile.x + pixel_index; - kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile); - } -} -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h deleted file mode 100644 index c8eb1ebd705..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_stopping(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; - int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; - int buffer_offset = (kernel_split_params.tile.offset + x + - y * kernel_split_params.tile.stride) * - kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - kernel_do_adaptive_stopping(kg, - buffer, - kernel_split_params.tile.start_sample + - kernel_split_params.tile.num_samples - 1); - } -} -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h deleted file mode 100644 index 45f5037d321..00000000000 --- a/intern/cycles/kernel/split/kernel_branched.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __BRANCHED_PATH__ - -/* sets up the various state needed to do an indirect loop */ -ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - /* save a copy of the state to restore later */ -# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index]; - - BRANCHED_STORE(path_state); - BRANCHED_STORE(throughput); - BRANCHED_STORE(ray); - BRANCHED_STORE(isect); - BRANCHED_STORE(ray_state); - - *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index); - for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { - kernel_split_sd(branched_state_sd, ray_index)->closure[i] = - kernel_split_sd(sd, ray_index)->closure[i]; - } - -# undef BRANCHED_STORE - - /* Set loop counters to initial position. */ - branched_state->next_closure = 0; - branched_state->next_sample = 0; -} - -/* ends an indirect loop and restores the previous state */ -ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - /* restore state */ -# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name; - - BRANCHED_RESTORE(path_state); - BRANCHED_RESTORE(throughput); - BRANCHED_RESTORE(ray); - BRANCHED_RESTORE(isect); - BRANCHED_RESTORE(ray_state); - - *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index); - for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { - kernel_split_sd(sd, ray_index)->closure[i] = - kernel_split_sd(branched_state_sd, ray_index)->closure[i]; - } - -# undef BRANCHED_RESTORE - - /* leave indirect loop */ - REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); -} - 
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, - int ray_index) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - - int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - kernel_split_params.queue_index); - - if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { - return false; - } - -# define SPLIT_DATA_ENTRY(type, name, num) \ - if (num) { \ - kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \ - } - SPLIT_DATA_ENTRIES_BRANCHED_SHARED -# undef SPLIT_DATA_ENTRY - - *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index); - for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) { - kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i]; - } - - kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; - kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; - kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; - - path_radiance_init(kg, inactive_L); - path_radiance_copy_indirect(inactive_L, L); - - ray_state[inactive_ray] = RAY_REGENERATED; - ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); - ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); - - atomic_fetch_and_inc_uint32( - (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count); - - return true; -} - -/* bounce off surface and integrate indirect light */ -ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( - KernelGlobals *kg, - int ray_index, - float num_samples_adjust, - ShaderData *saved_sd, - bool reset_path_state, - bool wait_for_shared) -{ - SplitBranchedState 
*branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = saved_sd; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - float3 throughput = branched_state->throughput; - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; - - float sum_sample_weight = 0.0f; -# ifdef __DENOISING_FEATURES__ - if (ps->denoising_feature_weight > 0.0f) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - sum_sample_weight += sc->sample_weight; - } - } - else { - sum_sample_weight = 1.0f; - } -# endif /* __DENOISING_FEATURES__ */ - - for (int i = branched_state->next_closure; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSDF(sc->type)) - continue; - /* transparency is not handled here, but in outer loop */ - if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) - continue; - - int num_samples; - - if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) - num_samples = kernel_data.integrator.diffuse_samples; - else if (CLOSURE_IS_BSDF_BSSRDF(sc->type)) - num_samples = 1; - else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) - num_samples = kernel_data.integrator.glossy_samples; - else - num_samples = kernel_data.integrator.transmission_samples; - - num_samples = ceil_to_int(num_samples_adjust * num_samples); - - float num_samples_inv = num_samples_adjust / num_samples; - - for (int j = branched_state->next_sample; j < num_samples; j++) { - if (reset_path_state) { - *ps = branched_state->path_state; - } - - ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); - - ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; - *tp = throughput; - - ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; - - if (!kernel_branched_path_surface_bounce( - kg, sd, sc, j, num_samples, tp, ps, &L->state, 
bsdf_ray, sum_sample_weight)) { - continue; - } - - ps->rng_hash = branched_state->path_state.rng_hash; - - /* update state for next iteration */ - branched_state->next_closure = i; - branched_state->next_sample = j + 1; - - /* start the indirect path */ - *tp *= num_samples_inv; - - if (kernel_split_branched_indirect_start_shared(kg, ray_index)) { - continue; - } - - return true; - } - - branched_state->next_sample = 0; - } - - branched_state->next_closure = sd->num_closure; - - if (wait_for_shared) { - branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - } - - return false; -} - -#endif /* __BRANCHED_PATH__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h deleted file mode 100644 index b96feca582f..00000000000 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of rays that hit the background (sceneintersect - * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's - * accumulated radiance in the output buffer. This kernel also takes care of - * rays that have been determined to-be-regenerated. 
- * - * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel. - * - * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER - * will be eventually set to RAY_TO_REGENERATE state in this kernel. - * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put - * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and - * RAY_REGENERATED rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - */ -ccl_device void kernel_buffer_update(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (ray_index == 0) { - /* We will empty this queue in this kernel. 
*/ - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(kg, - ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (ray_index != QUEUE_EMPTY_SLOT) { - ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - bool ray_was_updated = false; - - if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ray_was_updated = true; - uint sample = state->sample; - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - /* accumulate result in output buffer */ - kernel_write_result(kg, buffer, sample, L); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - - if (kernel_data.film.cryptomatte_passes) { - /* Make sure no thread is writing to the buffers. 
*/ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) { - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; - kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); - } - } - - if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - /* We have completed current work; So get next work */ - ccl_global uint *work_pools = kernel_split_params.work_pools; - uint total_work_size = kernel_split_params.total_work_size; - uint work_index; - - if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } - - if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - /* Store buffer offset for writing to passes. */ - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - kernel_split_state.buffer_offset[ray_index] = buffer_offset; - - /* Initialize random numbers and ray. */ - uint rng_hash; - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray); - - if (ray->t != 0.0f) { - /* Initialize throughput, path radiance, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - path_radiance_init(kg, L); - path_state_init(kg, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - state, - rng_hash, - sample, - ray); -#ifdef __SUBSURFACE__ - kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - } - } - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. - */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h deleted file mode 100644 index 2f83a10316d..00000000000 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel Initializes structures needed in path-iteration kernels. 
- * - * Note on Queues: - * All slots in queues are initialized to queue empty slot; - * The number of elements in the queues is initialized to 0; - */ - -#ifndef __KERNEL_CPU__ -ccl_device void kernel_data_init( -#else -void KERNEL_FUNCTION_FULL_NAME(data_init)( -#endif - KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - -#ifdef __KERNEL_OPENCL__ - KERNEL_BUFFER_PARAMS, -#endif - - int start_sample, - int end_sample, - int sx, - int sy, - int sw, - int sh, - int offset, - int stride, - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues - to fetch ray index */ - ccl_global unsigned int *work_pools, /* Work pool for each work group */ - unsigned int num_samples, - ccl_global float *buffer) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, data_init); -#else - -# ifdef __KERNEL_OPENCL__ - kg->data = data; -# endif - - kernel_split_params.tile.x = sx; - kernel_split_params.tile.y = sy; - kernel_split_params.tile.w = sw; - kernel_split_params.tile.h = sh; - - kernel_split_params.tile.start_sample = start_sample; - kernel_split_params.tile.num_samples = num_samples; - - kernel_split_params.tile.offset = offset; - kernel_split_params.tile.stride = stride; - - kernel_split_params.tile.buffer = buffer; - - kernel_split_params.total_work_size = sw * sh * num_samples; - - kernel_split_params.work_pools = work_pools; - - kernel_split_params.queue_index = Queue_index; - kernel_split_params.queue_size = queuesize; - kernel_split_params.use_queues_flag = use_queues_flag; - - split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); - -# ifdef __KERNEL_OPENCL__ - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); -# endif - - int thread_index = ccl_global_id(1) 
* ccl_global_size(0) + ccl_global_id(0); - - /* Initialize queue data and queue index. */ - if (thread_index < queuesize) { - for (int i = 0; i < NUM_QUEUES; i++) { - kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } - } - - if (thread_index == 0) { - for (int i = 0; i < NUM_QUEUES; i++) { - Queue_index[i] = 0; - } - - /* The scene-intersect kernel should not use the queues very first time. - * since the queue would be empty. - */ - *use_queues_flag = 0; - } -#endif /* KERENL_STUB */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h deleted file mode 100644 index 3be2b35812f..00000000000 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of direct lighting logic. - * However, the "shadow ray cast" part of direct lighting is handled - * in the next kernel. - * - * This kernels determines the rays for which a shadow_blocked() function - * associated with direct lighting should be executed. 
Those rays for which - * a shadow_blocked() function for direct-lighting must be executed, are - * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue - * QUEUE_SHADOW_RAY_CAST_DL_RAYS - * - * Note on Queues: - * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue - * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute - * the corresponding shadow_blocked part, after direct lighting, the ray is - * marked with RAY_SHADOW_RAY_CAST_DL flag. - * - * State of queues when this kernel is called: - * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this - * kernel call. - * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a - * shadow_blocked function must be executed, after this kernel call - * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. - */ -ccl_device void kernel_direct_lighting(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - /* direct lighting */ -#ifdef __EMISSION__ - bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); - -# ifdef __BRANCHED_PATH__ - if (flag && kernel_data.integrator.branched) { - flag = false; - enqueue_flag = 1; - } -# endif /* __BRANCHED_PATH__ */ - -# ifdef __SHADOW_TRICKS__ - if (flag && state->flag & 
PATH_RAY_SHADOW_CATCHER) { - flag = false; - enqueue_flag = 1; - } -# endif /* __SHADOW_TRICKS__ */ - - if (flag) { - /* Sample illumination from lights to find path contribution. */ - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, state); - - LightSample ls; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - Ray light_ray; - light_ray.time = sd->time; - - BsdfEval L_light; - bool is_lamp; - if (direct_emission(kg, - sd, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - &ls, - state, - &light_ray, - &L_light, - &is_lamp, - terminate)) { - /* Write intermediate data to global memory to access from - * the next kernel. - */ - kernel_split_state.light_ray[ray_index] = light_ray; - kernel_split_state.bsdf_eval[ray_index] = L_light; - kernel_split_state.is_lamp[ray_index] = is_lamp; - /* Mark ray state for next shadow kernel. */ - enqueue_flag = 1; - } - } - } -#endif /* __EMISSION__ */ - } - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif - -#ifdef __BRANCHED_PATH__ - /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays - * this is the last kernel before next_iteration_setup that uses local atomics so we do this here - */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_LIGHT_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h deleted file mode 100644 index 1775e870f07..00000000000 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) && defined(__VOLUME__) - -ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, - int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); -} - -ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = kernel_split_sd(sd, ray_index); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - /* GPU: no decoupled ray marching, scatter probabilistically. */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - Ray volume_ray = branched_state->ray; - volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
- branched_state->isect.t : - FLT_MAX; - - float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack); - - for (int j = branched_state->next_sample; j < num_samples; j++) { - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; - *ps = branched_state->path_state; - - ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; - *pray = branched_state->ray; - - ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; - *tp = branched_state->throughput * num_samples_inv; - - /* branch RNG state */ - path_state_branch(ps, j, num_samples); - - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, ps, sd, &volume_ray, L, tp, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L); - - /* indirect light bounce */ - if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) { - continue; - } - - /* start the indirect path */ - branched_state->next_closure = 0; - branched_state->next_sample = j + 1; - - /* Attempting to share too many samples is slow for volumes as it causes us to - * loop here more and have many calls to kernel_volume_integrate which evaluates - * shaders. The many expensive shader evaluations cause the work load to become - * unbalanced and many threads to become idle in this kernel. Limiting the - * number of shared samples here helps quite a lot. 
- */ - if (branched_state->shared_sample_count < 2) { - if (kernel_split_branched_indirect_start_shared(kg, ray_index)) { - continue; - } - } - - return true; - } -# endif - } - - branched_state->next_sample = num_samples; - - branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - - /* todo: avoid this calculation using decoupled ray marching */ - float3 throughput = kernel_split_state.throughput[ray_index]; - kernel_volume_shadow( - kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); - kernel_split_state.throughput[ray_index] = throughput; - - return false; -} - -#endif /* __BRANCHED_PATH__ && __VOLUME__ */ - -ccl_device void kernel_do_volume(KernelGlobals *kg) -{ -#ifdef __VOLUME__ - /* We will empty this queue in this kernel. */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; -# ifdef __BRANCHED_PATH__ - kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; -# endif /* __BRANCHED_PATH__ */ - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - if (*kernel_split_params.use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global Intersection *isect = 
&kernel_split_state.isect[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); - - /* Sanitize volume stack. */ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - /* volume attenuation, emission, scatter */ - if (state->volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = *ray; - volume_ray.t = (hit) ? isect->t : FLT_MAX; - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -# endif /* __BRANCHED_PATH__ */ - float step_size = volume_stack_step_size(kg, state->volume_stack); - - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); - - /* indirect light bounce */ - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_path_end(kg, ray_index); - } - } -# endif /* __VOLUME_SCATTER__ */ - } - -# ifdef __BRANCHED_PATH__ - } - else { - kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - } - } - -# ifdef __BRANCHED_PATH__ - /* iter loop */ - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_VOLUME_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { - /* 
for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); - path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); - - if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - -#endif /* __VOLUME__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h deleted file mode 100644 index 745313f89f1..00000000000 --- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ -#ifdef __BRANCHED_PATH__ - /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - char enqueue_flag = 0; - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { - enqueue_flag = 1; - } - - enqueue_ray_index_local(ray_index, - QUEUE_INACTIVE_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h deleted file mode 100644 index 61722840b0b..00000000000 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of the logic to process "material of type holdout", - * indirect primitive emission, bsdf blurring, probabilistic path termination - * and AO. - * - * This kernels determines the rays for which a shadow_blocked() function - * associated with AO should be executed. 
Those rays for which a - * shadow_blocked() function for AO must be executed are marked with flag - * RAY_SHADOW_RAY_CAST_ao and enqueued into the queue - * QUEUE_SHADOW_RAY_CAST_AO_RAYS - * - * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER - * - * Note on Queues: - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS - * and processes only the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and - * reach RAY_UPDATE_BUFFER state. These rays are enqueued into - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present - * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has - * been changed to RAY_UPDATE_BUFFER, there is no problem. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and - * RAY_REGENERATED rays - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE rays. - * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED and RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. 
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with - * flag RAY_SHADOW_RAY_CAST_AO - */ - -ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( - KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - locals->queue_atomics_bg = 0; - locals->queue_atomics_ao = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - -#ifdef __AO__ - char enqueue_flag = 0; -#endif - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index != QUEUE_EMPTY_SLOT) { - ccl_global PathState *state = 0x0; - float3 throughput; - - ccl_global char *ray_state = kernel_split_state.ray_state; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - throughput = kernel_split_state.throughput[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - - if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) { - kernel_split_path_end(kg, ray_index); - } - } - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* Path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate. 
- */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - kernel_split_path_end(kg, ray_index); - } - else if (probability < 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - if (terminate >= probability) { - kernel_split_path_end(kg, ray_index); - } - else { - kernel_split_state.throughput[ray_index] = throughput / probability; - } - } - -#ifdef __DENOISING_FEATURES__ - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_update_denoising_features(kg, sd, state, L); - } -#endif - } - -#ifdef __AO__ - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - enqueue_flag = 1; - } - } -#endif /* __AO__ */ - } - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_ao, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h deleted file mode 100644 index 6d500650cc0..00000000000 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_indirect_background(KernelGlobals *kg) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - int ray_index; - - if (kernel_data.integrator.ao_bounces != INT_MAX) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index != QUEUE_EMPTY_SLOT) { - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - if (path_state_ao_bounce(kg, state)) { - kernel_split_path_end(kg, ray_index); - } - } - } - } - - ray_index = get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - kernel_path_background(kg, state, ray, throughput, sd, buffer, L); - kernel_split_path_end(kg, ray_index); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h deleted file mode 100644 index 3f48f8d6f56..00000000000 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ /dev/null @@ -1,67 +0,0 @@ 
-/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) -{ - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index == 0) { - /* We will empty both queues in this kernel. */ - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - - int ray_index; - get_ray_index(kg, - thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - ray_index = get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - -#ifdef __SUBSURFACE__ - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - - if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - - /* Trace indirect subsurface rays by restarting the loop. 
this uses less - * stack memory than invoking kernel_path_indirect. - */ - if (ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -#endif /* __SUBSURFACE__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h deleted file mode 100644 index 7ecb099208d..00000000000 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND. - * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. - */ -ccl_device void kernel_lamp_emission(KernelGlobals *kg) -{ -#ifndef __VOLUME__ - /* We will empty this queue in this kernel. */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } -#endif - /* Fetch use_queues_flag. 
*/ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (local_use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, -#ifndef __VOLUME__ - 1 -#else - 0 -#endif - ); - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - - float3 throughput = kernel_split_state.throughput[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h deleted file mode 100644 index 320f6a414bf..00000000000 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -/*This kernel takes care of setting up ray for the next iteration of - * path-iteration and accumulating radiance corresponding to AO and - * direct-lighting - * - * Ray state of rays that are terminated in this kernel are changed - * to RAY_UPDATE_BUFFER. - * - * Note on queues: - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS - * and processes only the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and - * reach RAY_UPDATE_BUFF state. These rays are enqueued into - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present - * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has - * been changed to RAY_UPDATE_BUFF, there is no problem. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED, RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. 
- */ - -#ifdef __BRANCHED_PATH__ -ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); -} - -ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index) -{ - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - -# ifdef __VOLUME__ - if (!(sd->flag & SD_HAS_ONLY_VOLUME)) { -# endif - /* continue in case of transparency */ - *throughput *= shader_bsdf_transparency(kg, sd); - - if (is_zero(*throughput)) { - kernel_split_path_end(kg, ray_index); - return; - } - - /* Update Path State */ - path_state_next(kg, state, LABEL_TRANSPARENT); -# ifdef __VOLUME__ - } - else { - if (!path_state_volume_next(kg, state)) { - kernel_split_path_end(kg, ray_index); - return; - } - } -# endif - - ray->P = ray_offset(sd->P, -sd->Ng); - ray->t -= sd->ray_length; /* clipping works through transparent */ - -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD.dx = -sd->dI.dx; - ray->dD.dy = -sd->dI.dy; -# endif /* __RAY_DIFFERENTIALS__ */ - -# ifdef __VOLUME__ - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -# endif /* __VOLUME__ */ -} -#endif /* __BRANCHED_PATH__ */ - -ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed at least once. 
From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - *kernel_split_params.use_queues_flag = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - ccl_global char *ray_state = kernel_split_state.ray_state; - -#ifdef __VOLUME__ - /* Reactivate only volume rays here, most surface work was skipped. */ - if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); - } -#endif - - bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); - if (active) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - -#ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -#endif - /* Compute direct lighting and next bounce. 
*/ - if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) { - kernel_split_path_end(kg, ray_index); - } -#ifdef __BRANCHED_PATH__ - } - else if (sd->flag & SD_HAS_ONLY_VOLUME) { - kernel_split_branched_transparent_bounce(kg, ray_index); - } - else { - kernel_split_branched_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - kernel_split_branched_transparent_bounce(kg, ray_index); - } - } -#endif /* __BRANCHED_PATH__ */ - } - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -#ifdef __BRANCHED_PATH__ - /* iter loop */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; - } - - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_LIGHT_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - 
kernel_split_branched_transparent_bounce(kg, ray_index); - } - } - -# ifdef __VOLUME__ - /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_VOLUME_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -# endif /* __VOLUME__ */ - -# ifdef __SUBSURFACE__ - /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_SUBSURFACE_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -# endif /* __SUBSURFACE__ */ -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h deleted file mode 100644 index c686f46a0cd..00000000000 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. - * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - */ -ccl_device void kernel_path_init(KernelGlobals *kg) -{ - int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); - - /* This is the first assignment to ray_state; - * So we don't use ASSIGN_RAY_STATE macro. - */ - kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - - /* Get work. */ - ccl_global uint *work_pools = kernel_split_params.work_pools; - uint total_work_size = kernel_split_params.total_work_size; - uint work_index; - - if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { - /* No more work, mark ray as inactive */ - kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; - - return; - } - - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - /* Store buffer offset for writing to passes. */ - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - kernel_split_state.buffer_offset[ray_index] = buffer_offset; - - /* Initialize random numbers and ray. */ - uint rng_hash; - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]); - - if (kernel_split_state.ray[ray_index].t != 0.0f) { - /* Initialize throughput, path radiance, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]); - path_state_init(kg, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - &kernel_split_state.path_state[ray_index], - rng_hash, - sample, - &kernel_split_state.ray[ray_index]); -#ifdef __SUBSURFACE__ - kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); -#endif - } - else { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h deleted file mode 100644 index 2db87f7a671..00000000000 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel enqueues rays of different ray state into their - * appropriate queues: - * - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel are enqueued in - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in pat - * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
- * - * State of queue during other times this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE - * and RAY_UPDATE_BUFFER rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ -ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals) -{ - /* We have only 2 cases (Hit/Not-Hit) */ - int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - if (lidx == 0) { - locals->queue_atomics[0] = 0; - locals->queue_atomics[1] = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if (queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - if (lidx == 0) { - locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset( - QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index); - locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset( - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - 
locals->queue_atomics, - kernel_split_params.queue_index); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if (queue_number != -1) { - my_gqidx = get_global_queue_index( - queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics); - kernel_split_state.queue_data[my_gqidx] = ray_index; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h deleted file mode 100644 index 9ac95aafd2f..00000000000 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of scene_intersect function. - * - * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. - * This kernel processes rays of ray state RAY_ACTIVE - * This kernel determines the rays that have hit the background and changes - * their ray state to RAY_HIT_BACKGROUND. 
- */ -ccl_device void kernel_scene_intersect(KernelGlobals *kg) -{ - /* Fetch use_queues_flag */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (local_use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } - - /* All regenerated rays become active here */ - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { -#ifdef __BRANCHED_PATH__ - if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { - kernel_split_path_end(kg, ray_index); - } - else -#endif /* __BRANCHED_PATH__ */ - { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - } - } - - if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - return; - } - - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - Intersection isect; - const int last_object = state->bounce > 0 ? - intersection_get_object(kg, &kernel_split_state.isect[ray_index]) : - OBJECT_NONE; - bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object); - kernel_split_state.isect[ray_index] = isect; - - if (!hit) { - /* Change the state of rays that hit the background; - * These rays undergo special processing in the - * background_bufferUpdate kernel. 
- */ - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h deleted file mode 100644 index c760a2b2049..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel evaluates ShaderData structure from the values computed - * by the previous kernels. 
- */ -ccl_device void kernel_shader_eval(KernelGlobals *kg) -{ - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - /* Sorting on cuda split is not implemented */ -#ifdef __KERNEL_CUDA__ - int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; -#else - int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; -#endif - if (ray_index >= queue_index) { - return; - } - ray_index = get_ray_index(kg, - ray_index, -#ifdef __KERNEL_CUDA__ - QUEUE_ACTIVE_AND_REGENERATED_RAYS, -#else - QUEUE_SHADER_SORTED_RAYS, -#endif - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag); -#ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) { - shader_merge_closures(kernel_split_sd(sd, ray_index)); - } - else -#endif - { - shader_prepare_closures(kernel_split_sd(sd, ray_index), state); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h deleted file mode 100644 index 551836d1653..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_setup.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel sets up the ShaderData structure from the values computed - * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - */ -ccl_device void kernel_shader_setup(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; - if (ray_index < queue_index) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - } - else { - ray_index = QUEUE_EMPTY_SLOT; - } - - char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : - 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. 
*/ - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - shader_setup_from_ray(kg, sd, &isect, &ray); - -#ifdef __VOLUME__ - if (sd->flag & SD_HAS_ONLY_VOLUME) { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME); - } -#endif - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h deleted file mode 100644 index 95d33a42014..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals) -{ -#ifndef __KERNEL_CUDA__ - int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; - if (tid == 0) { - kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; - } - - uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE; - if (offset >= qsize) { - return; - } - - int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); - uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); - ccl_local uint *local_value = &locals->local_value[0]; - ccl_local ushort *local_index = &locals->local_index[0]; - - /* copy to local memory */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { - uint idx = offset + i + lid; - uint add = input + idx; - uint value = (~0); - if (idx < qsize) { - int ray_index = kernel_split_state.queue_data[add]; - bool valid = (ray_index != QUEUE_EMPTY_SLOT) && - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - if (valid) { - value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK; - } - } - local_value[i + lid] = value; - local_index[i + lid] = i + lid; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* skip sorting for cpu split kernel */ -# ifdef __KERNEL_OPENCL__ - - /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { - uint i = lid + ii; - bool direction = ((i & (length << 1)) != 0); - uint j = i ^ inc; - ushort ioff = local_index[i]; - ushort joff = local_index[j]; - uint iKey = local_value[ioff]; - uint jKey = local_value[joff]; - bool smaller = (jKey < iKey) || (jKey == iKey && j < 
i); - bool swap = smaller ^ (j < i) ^ direction; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - local_index[i] = (swap) ? joff : ioff; - local_index[j] = (swap) ? ioff : joff; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - } - } - } -# endif /* __KERNEL_OPENCL__ */ - - /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { - uint idx = offset + i + lid; - uint lidx = local_index[i + lid]; - uint outi = output + idx; - uint ini = input + offset + lidx; - uint value = local_value[lidx]; - if (idx < qsize) { - kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : - kernel_split_state.queue_data[ini]; - } - } -#endif /* __KERNEL_CUDA__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h deleted file mode 100644 index 5d772fc597b..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Shadow ray cast for AO. 
*/ -ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) -{ - unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index < ao_queue_length) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - -#ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -#endif - kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); -#ifdef __BRANCHED_PATH__ - } - else { - kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); - } -#endif -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h deleted file mode 100644 index 5e46d300bca..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Shadow ray cast for direct visible light. */ -ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) -{ - unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index < dl_queue_length) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - -#ifdef __BRANCHED_PATH__ - /* TODO(mai): move this somewhere else? */ - if (thread_index == 0) { - /* Clear QUEUE_INACTIVE_RAYS before next kernel. 
*/ - kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; - } -#endif /* __BRANCHED_PATH__ */ - - if (ray_index == QUEUE_EMPTY_SLOT) - return; - - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.light_ray[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - float3 throughput = kernel_split_state.throughput[ray_index]; - - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - bool is_lamp = kernel_split_state.is_lamp[ray_index]; - -#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) - bool use_branched = false; - int all = 0; - - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - use_branched = true; - all = 1; - } -# if defined(__BRANCHED_PATH__) - else if (kernel_data.integrator.branched) { - use_branched = true; - - if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - all = (kernel_data.integrator.sample_all_lights_indirect); - } - else { - all = (kernel_data.integrator.sample_all_lights_direct); - } - } -# endif /* __BRANCHED_PATH__ */ - - if (use_branched) { - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, state, throughput, 1.0f, L, all); - } - else -#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ - { - /* trace shadow ray */ - float3 shadow; - - if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput, &L_light); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h deleted file mode 100644 index 5114f2b03e5..00000000000 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ 
/dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_SPLIT_H__ -#define __KERNEL_SPLIT_H__ - -// clang-format off -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" - -#include "kernel/split/kernel_split_data.h" - -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" - -#ifdef __OSL__ -# include "kernel/osl/osl_shader.h" -#endif - -#ifdef __KERNEL_OPENCL__ -# include "kernel/kernels/opencl/kernel_opencl_image.h" -#endif -#ifdef __KERNEL_CUDA__ -# include "kernel/kernels/cuda/kernel_cuda_image.h" -#endif -#ifdef __KERNEL_CPU__ -# include "kernel/kernels/cpu/kernel_cpu_image.h" -#endif - -#include "util/util_atomic.h" - -#include "kernel/kernel_path.h" -#ifdef __BRANCHED_PATH__ -# include "kernel/kernel_path_branched.h" -#endif - -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" - -#ifdef __BRANCHED_PATH__ -# include "kernel/split/kernel_branched.h" -#endif -// clang-format on - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - -#ifdef __BRANCHED_PATH__ -# ifdef __SUBSURFACE__ - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - - if (ss_indirect->num_rays) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - else -# endif /* 
__SUBSURFACE__ */ - if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { - int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; - - path_radiance_sum_indirect(L); - path_radiance_accum_sample(orig_ray_L, L); - - atomic_fetch_and_dec_uint32( - (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } -#else - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); -#endif -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h deleted file mode 100644 index decc537b39b..00000000000 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_SPLIT_DATA_H__ -#define __KERNEL_SPLIT_DATA_H__ - -#include "kernel/split/kernel_split_data_types.h" - -#include "kernel/kernel_globals.h" - -CCL_NAMESPACE_BEGIN - -ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) -{ - (void)kg; /* Unused on CPU. */ - - uint64_t size = 0; -#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16) - size = size SPLIT_DATA_ENTRIES; -#undef SPLIT_DATA_ENTRY - - uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1); - -#ifdef __BRANCHED_PATH__ - size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); -#endif - - size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); - - return size; -} - -ccl_device_inline void split_data_init(KernelGlobals *kg, - ccl_global SplitData *split_data, - size_t num_elements, - ccl_global void *data, - ccl_global char *ray_state) -{ - (void)kg; /* Unused on CPU. 
*/ - - ccl_global char *p = (ccl_global char *)data; - -#define SPLIT_DATA_ENTRY(type, name, num) \ - split_data->name = (type *)p; \ - p += align_up(num_elements * num * sizeof(type), 16); - SPLIT_DATA_ENTRIES; -#undef SPLIT_DATA_ENTRY - - uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1); - -#ifdef __BRANCHED_PATH__ - split_data->_branched_state_sd = (ShaderData *)p; - p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); -#endif - - split_data->_sd = (ShaderData *)p; - p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); - - split_data->ray_state = ray_state; -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h deleted file mode 100644 index 06bdce9947d..00000000000 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ -#define __KERNEL_SPLIT_DATA_TYPES_H__ - -CCL_NAMESPACE_BEGIN - -/* parameters used by the split kernels, we use a single struct to avoid passing these to each - * kernel */ - -typedef struct SplitParams { - WorkTile tile; - uint total_work_size; - - ccl_global unsigned int *work_pools; - - ccl_global int *queue_index; - int queue_size; - ccl_global char *use_queues_flag; - - /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ - int dummy_sd_flag; -} SplitParams; - -/* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - -/* SPLIT_DATA_ENTRY(type, name, num) */ - -#ifdef __BRANCHED_PATH__ - -typedef ccl_global struct SplitBranchedState { - /* various state that must be kept and restored after an indirect loop */ - PathState path_state; - float3 throughput; - Ray ray; - - Intersection isect; - - char ray_state; - - /* indirect loop state */ - int next_closure; - int next_sample; - -# ifdef __SUBSURFACE__ - int ss_next_closure; - int ss_next_sample; - int next_hit; - int num_hits; - - uint lcg_state; - LocalIntersection ss_isect; -# endif /* __SUBSURFACE__ */ - - int shared_sample_count; /* number of branched samples shared with other threads */ - int original_ray; /* index of original ray when sharing branched samples */ - bool waiting_on_shared_samples; -} SplitBranchedState; - -# define SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \ - SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0) -#else -# define SPLIT_DATA_BRANCHED_ENTRIES -#endif /* __BRANCHED_PATH__ */ - -#ifdef __SUBSURFACE__ -# define SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) -#else -# define SPLIT_DATA_SUBSURFACE_ENTRIES -#endif /* __SUBSURFACE__ */ - -#ifdef __VOLUME__ -# define 
SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) -#else -# define SPLIT_DATA_VOLUME_ENTRIES -#endif /* __VOLUME__ */ - -#define SPLIT_DATA_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ - SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ - SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ - SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ - SPLIT_DATA_ENTRY( \ - ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \ - SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ - SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ - SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_VOLUME_ENTRIES \ - SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(ShaderData, _sd, 0) - -/* Entries to be copied to inactive rays when sharing branched samples - * (TODO: which are actually needed?) 
*/ -#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ - SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ - SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ - SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ - SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ - SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ - SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_VOLUME_ENTRIES \ - SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(ShaderData, _sd, 0) - -/* struct that holds pointers to data in the shared state buffer */ -typedef struct SplitData { -#define SPLIT_DATA_ENTRY(type, name, num) type *name; - SPLIT_DATA_ENTRIES -#undef SPLIT_DATA_ENTRY - - /* this is actually in a separate buffer from the rest of the split state data (so it can be read - * back from the host easily) but is still used the same as the other data so we have it here in - * this struct as well - */ - ccl_global char *ray_state; -} SplitData; - -#ifndef __KERNEL_CUDA__ -# define kernel_split_state (kg->split_data) -# define kernel_split_params (kg->split_param_data) -#else -__device__ SplitData __split_data; -# define kernel_split_state (__split_data) -__device__ SplitParams __split_param_data; -# define kernel_split_params (__split_param_data) -#endif /* __KERNEL_CUDA__ */ - -#define kernel_split_sd(sd, ray_index) \ - ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \ - (sizeof(ShaderData) + \ - sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \ - (ray_index))) - -/* Local storage for queue_enqueue kernel. */ -typedef struct QueueEnqueueLocals { - uint queue_atomics[2]; -} QueueEnqueueLocals; - -/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. 
*/ -typedef struct BackgroundAOLocals { - uint queue_atomics_bg; - uint queue_atomics_ao; -} BackgroundAOLocals; - -typedef struct ShaderSortLocals { - uint local_value[SHADER_SORT_BLOCK_SIZE]; - ushort local_index[SHADER_SORT_BLOCK_SIZE]; -} ShaderSortLocals; - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h deleted file mode 100644 index ba06ae3bc53..00000000000 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__) - -ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, - int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - branched_state->ss_next_closure = 0; - branched_state->ss_next_sample = 0; - - branched_state->num_hits = 0; - branched_state->next_hit = 0; - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); -} - -ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter( - KernelGlobals *kg, int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* Closure memory will be overwritten, so read required variables now. 
*/ - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - - /* set up random number generator */ - if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && - branched_state->next_closure == 0 && branched_state->next_sample == 0) { - branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, - 0x68bc21eb); - } - int num_samples = kernel_data.integrator.subsurface_samples * 3; - float num_samples_inv = 1.0f / num_samples; - uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for (int j = branched_state->ss_next_sample; j < num_samples; j++) { - ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; - *hit_state = branched_state->path_state; - hit_state->rng_hash = bssrdf_rng_hash; - path_state_branch(hit_state, j, num_samples); - - ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect; - float bssrdf_u, bssrdf_v; - path_branched_rng_2D( - kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - - /* intersection is expensive so avoid doing multiple times for the same input */ - if (branched_state->next_hit == 0 && branched_state->next_closure == 0 && - branched_state->next_sample == 0) { - uint lcg_state = branched_state->lcg_state; - LocalIntersection ss_isect_private; - - branched_state->num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - branched_state->lcg_state = lcg_state; - *ss_isect = ss_isect_private; - } - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - -# ifdef __VOLUME__ - Ray volume_ray = branched_state->ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* 
compute lighting with the BSDF closure */ - for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { - ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index); - *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is - * important as the indirect path will write into bssrdf_sd */ - - LocalIntersection ss_isect_private = *ss_isect; - subsurface_scatter_multi_setup( - kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness); - *ss_isect = ss_isect_private; - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - /* Setup ray from previous surface point to the new one. */ - float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); - volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); - - for (int k = 0; k < VOLUME_STACK_SIZE; k++) { - hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k]; - } - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - -# ifdef __EMISSION__ - if (branched_state->next_closure == 0 && branched_state->next_sample == 0) { - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (hit_state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, - bssrdf_sd, - emission_sd, - hit_state, - branched_state->throughput, - num_samples_inv, - L, - all); - } - } -# endif /* __EMISSION__ */ - - /* indirect light */ - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) { - branched_state->ss_next_closure = i; - branched_state->ss_next_sample = j; - branched_state->next_hit = hit; - - return true; - } - - branched_state->next_closure = 0; - } - - branched_state->next_hit = 0; - } - - branched_state->ss_next_sample = 0; - } - - branched_state->ss_next_closure = sd->num_closure; - - 
branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - - return false; -} - -#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ - -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) -{ - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index == 0) { - /* We will empty both queues in this kernel. */ - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - -#ifdef __SUBSURFACE__ - ccl_global char *ray_state = kernel_split_state.ray_state; - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - if (sd->flag & SD_BSSRDF) { - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -# endif - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) { - 
kernel_split_path_end(kg, ray_index); - } -# ifdef __BRANCHED_PATH__ - } - else { - kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif - } - } - -# ifdef __BRANCHED_PATH__ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; - } - - /* iter loop */ - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_SUBSURFACE_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); - path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); - - if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - -#endif /* __SUBSURFACE__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 000da1fa615..4aee1ef11b3 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a) { kernel_assert(a + 2 < SVM_STACK_SIZE); - return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]); + float *stack_a = stack + a; + return make_float3(stack_a[0], stack_a[1], stack_a[2]); } ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f) { kernel_assert(a + 2 < SVM_STACK_SIZE); - stack[a + 0] = f.x; - stack[a + 1] = f.y; - stack[a + 2] = f.z; + float *stack_a = stack + a; + stack_a[0] = f.x; + stack_a[1] = f.y; + 
stack_a[2] = f.z; } ccl_device_inline float stack_load_float(float *stack, uint a) @@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a) /* Reading Nodes */ -ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset) +ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset) { uint4 node = kernel_tex_fetch(__svm_nodes, *offset); (*offset)++; return node; } -ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset) +ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset) { uint4 node = kernel_tex_fetch(__svm_nodes, *offset); float4 f = make_float4(__uint_as_float(node.x), @@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset) return f; } -ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset) +ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset) { uint4 node = kernel_tex_fetch(__svm_nodes, offset); return make_float4(__uint_as_float(node.x), @@ -217,26 +219,11 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN /* Main Interpreter Loop */ -#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void svm_eval_nodes(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ShaderType type, - int path_flag) -{ - optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag); -} -extern "C" __device__ void __direct_callable__svm_eval_nodes( -#else -ccl_device_noinline void svm_eval_nodes( -#endif - KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ShaderType type, - int path_flag) +template<uint node_feature_mask, ShaderType type> +ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + ccl_global float *render_buffer, + int path_flag) { float stack[SVM_STACK_SIZE]; int offset = sd->shader & SHADER_MASK; @@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes( switch (node.x) { case 
NODE_END: return; -#if NODES_GROUP(NODE_GROUP_LEVEL_0) case NODE_SHADER_JUMP: { if (type == SHADER_TYPE_SURFACE) offset = node.y; @@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes( break; } case NODE_CLOSURE_BSDF: - svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset); + offset = svm_node_closure_bsdf<node_feature_mask, type>( + kg, sd, stack, node, path_flag, offset); break; case NODE_CLOSURE_EMISSION: - svm_node_closure_emission(sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_closure_emission(sd, stack, node); + } break; case NODE_CLOSURE_BACKGROUND: - svm_node_closure_background(sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_closure_background(sd, stack, node); + } break; case NODE_CLOSURE_SET_WEIGHT: svm_node_closure_set_weight(sd, node.y, node.z, node.w); @@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes( svm_node_closure_weight(sd, stack, node.y); break; case NODE_EMISSION_WEIGHT: - svm_node_emission_weight(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_emission_weight(kg, sd, stack, node); + } break; case NODE_MIX_CLOSURE: svm_node_mix_closure(sd, stack, node); @@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes( svm_node_convert(kg, sd, stack, node.y, node.z, node.w); break; case NODE_TEX_COORD: - svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset); + offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); break; case NODE_VALUE_F: svm_node_value_f(kg, sd, stack, node.y, node.z); break; case NODE_VALUE_V: - svm_node_value_v(kg, sd, stack, node.y, &offset); + offset = svm_node_value_v(kg, sd, stack, node.y, offset); break; case NODE_ATTR: - svm_node_attr(kg, sd, stack, node); + svm_node_attr<node_feature_mask>(kg, sd, stack, node); break; case NODE_VERTEX_COLOR: svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w); break; -# if NODES_FEATURE(NODE_FEATURE_BUMP) case NODE_GEOMETRY_BUMP_DX: - 
svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + } break; case NODE_GEOMETRY_BUMP_DY: - svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + } break; case NODE_SET_DISPLACEMENT: - svm_node_set_displacement(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_displacement(kg, sd, stack, node.y); + } break; case NODE_DISPLACEMENT: - svm_node_displacement(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_displacement(kg, sd, stack, node); + } break; case NODE_VECTOR_DISPLACEMENT: - svm_node_vector_displacement(kg, sd, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_vector_displacement(kg, sd, stack, node, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ case NODE_TEX_IMAGE: - svm_node_tex_image(kg, sd, stack, node, &offset); + offset = svm_node_tex_image(kg, sd, stack, node, offset); break; case NODE_TEX_IMAGE_BOX: svm_node_tex_image_box(kg, sd, stack, node); break; case NODE_TEX_NOISE: - svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset); break; -# if NODES_FEATURE(NODE_FEATURE_BUMP) case NODE_SET_BUMP: - svm_node_set_bump(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_bump(kg, sd, stack, node); + } break; case NODE_ATTR_BUMP_DX: - svm_node_attr_bump_dx(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_attr_bump_dx(kg, sd, stack, node); + } break; case NODE_ATTR_BUMP_DY: - svm_node_attr_bump_dy(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_attr_bump_dy(kg, sd, stack, node); + } break; case NODE_VERTEX_COLOR_BUMP_DX: - svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); + if 
(KERNEL_NODES_FEATURE(BUMP)) { + svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); + } break; case NODE_VERTEX_COLOR_BUMP_DY: - svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); + } break; case NODE_TEX_COORD_BUMP_DX: - svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset); + } break; case NODE_TEX_COORD_BUMP_DY: - svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset); + } break; case NODE_CLOSURE_SET_NORMAL: - svm_node_set_normal(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_normal(kg, sd, stack, node.y, node.z); + } break; -# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE) case NODE_ENTER_BUMP_EVAL: - svm_node_enter_bump_eval(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP_STATE)) { + svm_node_enter_bump_eval(kg, sd, stack, node.y); + } break; case NODE_LEAVE_BUMP_EVAL: - svm_node_leave_bump_eval(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP_STATE)) { + svm_node_leave_bump_eval(kg, sd, stack, node.y); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */ -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ case NODE_HSV: - svm_node_hsv(kg, sd, stack, node, &offset); + svm_node_hsv(kg, sd, stack, node); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */ -#if NODES_GROUP(NODE_GROUP_LEVEL_1) case NODE_CLOSURE_HOLDOUT: svm_node_closure_holdout(sd, stack, node); break; @@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes( case NODE_LAYER_WEIGHT: svm_node_layer_weight(sd, stack, node); break; -# if NODES_FEATURE(NODE_FEATURE_VOLUME) case NODE_CLOSURE_VOLUME: - svm_node_closure_volume(kg, sd, stack, node, 
type); + if (KERNEL_NODES_FEATURE(VOLUME)) { + svm_node_closure_volume<type>(kg, sd, stack, node); + } break; case NODE_PRINCIPLED_VOLUME: - svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset); + if (KERNEL_NODES_FEATURE(VOLUME)) { + offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ case NODE_MATH: - svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_math(kg, sd, stack, node.y, node.z, node.w); break; case NODE_VECTOR_MATH: - svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_RGB_RAMP: - svm_node_rgb_ramp(kg, sd, stack, node, &offset); + offset = svm_node_rgb_ramp(kg, sd, stack, node, offset); break; case NODE_GAMMA: svm_node_gamma(sd, stack, node.y, node.z, node.w); @@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes( svm_node_brightness(sd, stack, node.y, node.z, node.w); break; case NODE_LIGHT_PATH: - svm_node_light_path(sd, state, stack, node.y, node.z, path_flag); + svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag); break; case NODE_OBJECT_INFO: svm_node_object_info(kg, sd, stack, node.y, node.z); @@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes( case NODE_PARTICLE_INFO: svm_node_particle_info(kg, sd, stack, node.y, node.z); break; -# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR) +#if defined(__HAIR__) case NODE_HAIR_INFO: - svm_node_hair_info(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(HAIR)) { + svm_node_hair_info(kg, sd, stack, node.y, node.z); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */ -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */ +#endif -#if NODES_GROUP(NODE_GROUP_LEVEL_2) case NODE_TEXTURE_MAPPING: - svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset); + offset = svm_node_texture_mapping(kg, sd, 
stack, node.y, node.z, offset); break; case NODE_MAPPING: - svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_mapping(kg, sd, stack, node.y, node.z, node.w); break; case NODE_MIN_MAX: - svm_node_min_max(kg, sd, stack, node.y, node.z, &offset); + offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset); break; case NODE_CAMERA: svm_node_camera(kg, sd, stack, node.y, node.z, node.w); @@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes( svm_node_tex_environment(kg, sd, stack, node); break; case NODE_TEX_SKY: - svm_node_tex_sky(kg, sd, stack, node, &offset); + offset = svm_node_tex_sky(kg, sd, stack, node, offset); break; case NODE_TEX_GRADIENT: svm_node_tex_gradient(sd, stack, node); break; case NODE_TEX_VORONOI: - svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_voronoi<node_feature_mask>( + kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_TEX_MUSGRAVE: - svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_TEX_WAVE: - svm_node_tex_wave(kg, sd, stack, node, &offset); + offset = svm_node_tex_wave(kg, sd, stack, node, offset); break; case NODE_TEX_MAGIC: - svm_node_tex_magic(kg, sd, stack, node, &offset); + offset = svm_node_tex_magic(kg, sd, stack, node, offset); break; case NODE_TEX_CHECKER: svm_node_tex_checker(kg, sd, stack, node); break; case NODE_TEX_BRICK: - svm_node_tex_brick(kg, sd, stack, node, &offset); + offset = svm_node_tex_brick(kg, sd, stack, node, offset); break; case NODE_TEX_WHITE_NOISE: - svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w); break; case NODE_NORMAL: - svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_LIGHT_FALLOFF: 
svm_node_light_falloff(sd, stack, node); break; case NODE_IES: - svm_node_ies(kg, sd, stack, node, &offset); + svm_node_ies(kg, sd, stack, node); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */ -#if NODES_GROUP(NODE_GROUP_LEVEL_3) case NODE_RGB_CURVES: case NODE_VECTOR_CURVES: - svm_node_curves(kg, sd, stack, node, &offset); + offset = svm_node_curves(kg, sd, stack, node, offset); break; case NODE_TANGENT: svm_node_tangent(kg, sd, stack, node); @@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes( svm_node_invert(sd, stack, node.y, node.z, node.w); break; case NODE_MIX: - svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_SEPARATE_VECTOR: svm_node_separate_vector(sd, stack, node.y, node.z, node.w); @@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes( svm_node_combine_vector(sd, stack, node.y, node.z, node.w); break; case NODE_SEPARATE_HSV: - svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_COMBINE_HSV: - svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_VECTOR_ROTATE: svm_node_vector_rotate(sd, stack, node.y, node.z, node.w); @@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes( svm_node_blackbody(kg, sd, stack, node.y, node.z); break; case NODE_MAP_RANGE: - svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_CLAMP: - svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset); break; -# ifdef __SHADER_RAYTRACE__ +#ifdef __SHADER_RAYTRACE__ case NODE_BEVEL: - svm_node_bevel(kg, sd, state, stack, node); + 
svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node); break; case NODE_AMBIENT_OCCLUSION: - svm_node_ao(kg, sd, state, stack, node); + svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node); break; -# endif /* __SHADER_RAYTRACE__ */ -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */ +#endif -#if NODES_GROUP(NODE_GROUP_LEVEL_4) -# if NODES_FEATURE(NODE_FEATURE_VOLUME) case NODE_TEX_VOXEL: - svm_node_tex_voxel(kg, sd, stack, node, &offset); + if (KERNEL_NODES_FEATURE(VOLUME)) { + offset = svm_node_tex_voxel(kg, sd, stack, node, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ case NODE_AOV_START: - if (!svm_node_aov_check(state, buffer)) { + if (!svm_node_aov_check(path_flag, render_buffer)) { return; } break; case NODE_AOV_COLOR: - svm_node_aov_color(kg, sd, stack, node, buffer); + svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer); break; case NODE_AOV_VALUE: - svm_node_aov_value(kg, sd, stack, node, buffer); + svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */ default: kernel_assert(!"Unknown node type was passed to the SVM machine"); return; diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h index 4cb986b897a..34ac2cb8fbf 100644 --- a/intern/cycles/kernel/svm/svm_ao.h +++ b/intern/cycles/kernel/svm/svm_ao.h @@ -14,20 +14,25 @@ * limitations under the License. 
*/ +#include "kernel/bvh/bvh.h" + CCL_NAMESPACE_BEGIN #ifdef __SHADER_RAYTRACE__ -ccl_device_noinline float svm_ao(KernelGlobals *kg, - ShaderData *sd, - float3 N, - ccl_addr_space PathState *state, - float max_dist, - int num_samples, - int flags) +# ifdef __KERNEL_OPTIX__ +extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, +# else +ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS, +# endif + ShaderData *sd, + float3 N, + float max_dist, + int num_samples, + int flags) { if (flags & NODE_AO_GLOBAL_RADIUS) { - max_dist = kernel_data.background.ao_distance; + max_dist = kernel_data.integrator.ao_bounces_distance; } /* Early out if no sampling needed. */ @@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, float3 T, B; make_orthonormals(N, &T, &B); + /* TODO: support ray-tracing in shadow shader evaluation? */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + int unoccluded = 0; for (int sample = 0; sample < num_samples; sample++) { float disk_u, disk_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); + path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); float2 d = concentric_sample_disk(disk_u, disk_v); float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d))); @@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, ray.D = D.x * T + D.y * B + D.z * N; ray.t = max_dist; ray.time = sd->time; - ray.dP = sd->dP; - ray.dD = differential3_zero(); + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); if (flags & NODE_AO_ONLY_LOCAL) { if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) { @@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, return ((float)unoccluded) / num_samples; } -ccl_device void svm_node_ao( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node) 
+template<uint node_feature_mask> +# if defined(__KERNEL_OPTIX__) +ccl_device_inline +# else +ccl_device_noinline +# endif + void + svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node) { uint flags, dist_offset, normal_offset, out_ao_offset; svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset); @@ -92,7 +106,16 @@ ccl_device void svm_node_ao( float dist = stack_load_float_default(stack, dist_offset, node.w); float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N; - float ao = svm_ao(kg, sd, normal, state, dist, samples, flags); + + float ao = 1.0f; + + if (KERNEL_NODES_FEATURE(RAYTRACE)) { +# ifdef __KERNEL_OPTIX__ + ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags); +# else + ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags); +# endif + } if (stack_valid(out_ao_offset)) { stack_store_float(stack, out_ao_offset, ao); diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h index 899e466d099..26dec9717b3 100644 --- a/intern/cycles/kernel/svm/svm_aov.h +++ b/intern/cycles/kernel/svm/svm_aov.h @@ -14,36 +14,50 @@ * limitations under the License. 
*/ +#include "kernel/kernel_write_passes.h" + CCL_NAMESPACE_BEGIN -ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state, - ccl_global float *buffer) +ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer) { - int path_flag = state->flag; - bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)); - return ((buffer != NULL) && is_primary); + return ((render_buffer != NULL) && is_primary); } -ccl_device void svm_node_aov_color( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer) +ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + float *stack, + uint4 node, + ccl_global float *render_buffer) { float3 val = stack_load_float3(stack, node.y); - if (buffer) { - kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z, - make_float4(val.x, val.y, val.z, 1.0f)); + if (render_buffer && !INTEGRATOR_STATE_IS_NULL) { + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset + + (kernel_data.film.pass_aov_color + node.z); + kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z)); } } -ccl_device void svm_node_aov_value( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer) +ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + float *stack, + uint4 node, + ccl_global float *render_buffer) { float val = stack_load_float(stack, node.y); - if (buffer) { - kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val); + if (render_buffer && !INTEGRATOR_STATE_IS_NULL) { + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = 
(uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset + + (kernel_data.film.pass_aov_value + node.z); + kernel_write_pass_float(buffer, val); } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 62740824ad1..5f94b20af73 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN /* Attribute Node */ -ccl_device AttributeDescriptor svm_node_attr_init( - KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset) +ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg, + ShaderData *sd, + uint4 node, + NodeAttributeOutputType *type, + uint *out_offset) { *out_offset = node.z; *type = (NodeAttributeOutputType)node.w; @@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init( return desc; } -ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +template<uint node_feature_mask> +ccl_device_noinline void svm_node_attr(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset); #ifdef __VOLUME__ - /* Volumes - * NOTE: moving this into its own node type might help improve performance. */ - if (primitive_is_volume_attribute(sd, desc)) { - const float4 value = volume_attribute_float4(kg, sd, desc); + if (KERNEL_NODES_FEATURE(VOLUME)) { + /* Volumes + * NOTE: moving this into its own node type might help improve performance. 
*/ + if (primitive_is_volume_attribute(sd, desc)) { + const float4 value = volume_attribute_float4(kg, sd, desc); - if (type == NODE_ATTR_OUTPUT_FLOAT) { - const float f = volume_attribute_value_to_float(value); - stack_store_float(stack, out_offset, f); - } - else if (type == NODE_ATTR_OUTPUT_FLOAT3) { - const float3 f = volume_attribute_value_to_float3(value); - stack_store_float3(stack, out_offset, f); + if (type == NODE_ATTR_OUTPUT_FLOAT) { + const float f = volume_attribute_value_to_float(value); + stack_store_float(stack, out_offset, f); + } + else if (type == NODE_ATTR_OUTPUT_FLOAT3) { + const float3 f = volume_attribute_value_to_float3(value); + stack_store_float3(stack, out_offset, f); + } + else { + const float f = volume_attribute_value_to_alpha(value); + stack_store_float(stack, out_offset, f); + } + return; } - else { - const float f = volume_attribute_value_to_alpha(value); - stack_store_float(stack, out_offset, f); - } - return; } #endif @@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u } } -ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; @@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float * } } -ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h index bf5957ec9e4..aab089d19ea 100644 --- a/intern/cycles/kernel/svm/svm_bevel.h +++ b/intern/cycles/kernel/svm/svm_bevel.h @@ -14,21 
+14,95 @@ * limitations under the License. */ +#include "kernel/bvh/bvh.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_random.h" + CCL_NAMESPACE_BEGIN #ifdef __SHADER_RAYTRACE__ +/* Planar Cubic BSSRDF falloff, reused for bevel. + * + * This is basically (Rm - x)^3, with some factors to normalize it. For sampling + * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as + * far as I can tell has no closed form solution. So we get an iterative solution + * instead with newton-raphson. */ + +ccl_device float svm_bevel_cubic_eval(const float radius, float r) +{ + const float Rm = radius; + + if (r >= Rm) + return 0.0f; + + /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */ + const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm; + const float f = Rm - r; + const float num = f * f * f; + + return (10.0f * num) / (Rm5 * M_PI_F); +} + +ccl_device float svm_bevel_cubic_pdf(const float radius, float r) +{ + return svm_bevel_cubic_eval(radius, r); +} + +/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ +ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi) +{ + /* newton-raphson iteration, usually succeeds in 2-4 iterations, except + * outside 0.02 ... 
0.98 where it can go up to 10, so overall performance + * should not be too bad */ + const float tolerance = 1e-6f; + const int max_iteration_count = 10; + float x = 0.25f; + int i; + + for (i = 0; i < max_iteration_count; i++) { + float x2 = x * x; + float x3 = x2 * x; + float nx = (1.0f - x); + + float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi; + float f_ = 20.0f * (x * nx) * (nx * nx); + + if (fabsf(f) < tolerance || f_ == 0.0f) + break; + + x = saturate(x - f / f_); + } + + return x; +} + +ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h) +{ + float Rm = radius; + float r_ = svm_bevel_cubic_quintic_root_find(xi); + + r_ *= Rm; + *r = r_; + + /* h^2 + r^2 = Rm^2 */ + *h = safe_sqrtf(Rm * Rm - r_ * r_); +} + /* Bevel shader averaging normals from nearby surfaces. * * Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013 * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf */ -ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float radius, - int num_samples) +# ifdef __KERNEL_OPTIX__ +extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, +# else +ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS, +# endif + ShaderData *sd, + float radius, + int num_samples) { /* Early out if no sampling needed. */ if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) { @@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, } /* Don't bevel for blurry indirect rays. */ - if (state->min_ray_pdf < 8.0f) { + if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) { return sd->N; } /* Setup for multi intersection. 
*/ LocalIntersection isect; - uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e); + uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash), + INTEGRATOR_STATE(path, rng_offset), + INTEGRATOR_STATE(path, sample), + 0x64c6a40e); /* Sample normals from surrounding points on surface. */ float3 sum_N = make_float3(0.0f, 0.0f, 0.0f); + /* TODO: support ray-tracing in shadow shader evaluation? */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + for (int sample = 0; sample < num_samples; sample++) { float disk_u, disk_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); + path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); /* Pick random axis in local frame and point on disk. */ float3 disk_N, disk_T, disk_B; @@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, float disk_height; /* Perhaps find something better than Cubic BSSRDF, but happens to work well. */ - bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height); + svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height); float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B; @@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, ray->P = sd->P + disk_N * disk_height + disk_P; ray->D = -disk_N; ray->t = 2.0f * disk_height; - ray->dP = sd->dP; - ray->dD = differential3_zero(); + ray->dP = differential_zero_compact(); + ray->dD = differential_zero_compact(); ray->time = sd->time; /* Intersect with the same object. if multiple intersections are found it @@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; if (sd->type & PRIMITIVE_TRIANGLE) { - hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray); + hit_P = triangle_refine_local( + kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim); } # ifdef __OBJECT_MOTION__ else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts); - hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts); + hit_P = motion_triangle_refine_local( + kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts); } # endif /* __OBJECT_MOTION__ */ @@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, float r = len(hit_P - sd->P); /* Compute weight. */ - float pdf = bssrdf_cubic_pdf(radius, 0.0f, r); - float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r); + float pdf = svm_bevel_cubic_pdf(radius, r); + float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r); w *= pdf / disk_pdf; @@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N; } -ccl_device void svm_node_bevel( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node) +template<uint node_feature_mask> +# if defined(__KERNEL_OPTIX__) +ccl_device_inline +# else +ccl_device_noinline +# endif + void + svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node) { uint num_samples, radius_offset, normal_offset, out_offset; svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset); float radius = stack_load_float(stack, radius_offset); - float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples); - if (stack_valid(normal_offset)) { - /* Preserve input normal. 
*/ - float3 ref_N = stack_load_float3(stack, normal_offset); - bevel_N = normalize(ref_N + (bevel_N - sd->N)); + float3 bevel_N = sd->N; + + if (KERNEL_NODES_FEATURE(RAYTRACE)) { +# ifdef __KERNEL_OPTIX__ + bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples); +# else + bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples); +# endif + + if (stack_valid(normal_offset)) { + /* Preserve input normal. */ + float3 ref_N = stack_load_float3(stack, normal_offset); + bevel_N = normalize(ref_N + (bevel_N - sd->N)); + } } stack_store_float3(stack, out_offset, bevel_N); diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index adfc50d961e..96b3703b954 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN /* Blackbody Node */ -ccl_device void svm_node_blackbody( - KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset) +ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint temperature_offset, + uint col_offset) { /* Input */ float temperature = stack_load_float(stack, temperature_offset); diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h index 6984afa30a5..dca1b220dd5 100644 --- a/intern/cycles/kernel/svm/svm_brick.h +++ b/intern/cycles/kernel/svm/svm_brick.h @@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p, return make_float2(tint, mortar); } -ccl_device void svm_node_tex_brick( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_brick( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 node2 = read_node(kg, offset); - uint4 node3 = read_node(kg, offset); - uint4 node4 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); + uint4 node3 = 
read_node(kg, &offset); + uint4 node4 = read_node(kg, &offset); /* Input and Output Sockets */ uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset; @@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick( stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f); if (stack_valid(fac_offset)) stack_store_float(stack, fac_offset, f); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h index 9554b5946fb..2ed812acd71 100644 --- a/intern/cycles/kernel/svm/svm_brightness.h +++ b/intern/cycles/kernel/svm/svm_brightness.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_brightness( +ccl_device_noinline void svm_node_brightness( ShaderData *sd, float *stack, uint in_color, uint out_color, uint node) { uint bright_offset, contrast_offset; diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index c9d430a2bba..8672839dbab 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN /* Bump Eval Nodes */ -ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offset) +ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offset) { /* save state */ stack_store_float3(stack, offset + 0, sd->P); @@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, } } -ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offset) +ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offset) { /* restore state */ sd->P = stack_load_float3(stack, offset + 0); diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 21a17acf5f1..40c0edcdad0 100644 --- 
a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -16,12 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_camera(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint out_vector, - uint out_zdepth, - uint out_distance) +ccl_device_noinline void svm_node_camera(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint out_vector, + uint out_zdepth, + uint out_distance) { float distance; float zdepth; diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h index d54cb73df91..a9919c9ddc9 100644 --- a/intern/cycles/kernel/svm/svm_checker.h +++ b/intern/cycles/kernel/svm/svm_checker.h @@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p) return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f; } -ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint co_offset, color1_offset, color2_offset, scale_offset; uint color_offset, fac_offset; diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h index a85fd82754e..656bd31c085 100644 --- a/intern/cycles/kernel/svm/svm_clamp.h +++ b/intern/cycles/kernel/svm/svm_clamp.h @@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN /* Clamp Node */ -ccl_device void svm_node_clamp(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint value_stack_offset, - uint parameters_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint value_stack_offset, + uint parameters_stack_offsets, + uint result_stack_offset, + int offset) { uint min_stack_offset, max_stack_offset, type; svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type); - uint4 defaults = read_node(kg, offset); + uint4 defaults = read_node(kg, &offset); 
float value = stack_load_float(stack, value_stack_offset); float min = stack_load_float_default(stack, min_stack_offset, defaults.x); @@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg, else { stack_store_float(stack, result_stack_offset, clamp(value, min, max)); } + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index bbe8d72edf0..e2f6dde4ace 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup( } } -ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node, - ShaderType shader_type, - int path_flag, - int *offset) +template<uint node_feature_mask, ShaderType shader_type> +ccl_device_noinline int svm_node_closure_bsdf( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset) { uint type, param1_offset, param2_offset; @@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, 1.0f); /* note we read this extra node before weight check, so offset is added */ - uint4 data_node = read_node(kg, offset); + uint4 data_node = read_node(kg, &offset); /* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */ - if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) { + if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) { if (type == CLOSURE_BSDF_PRINCIPLED_ID) { /* Read all principled BSDF extra data to get the right offset. */ - read_node(kg, offset); - read_node(kg, offset); - read_node(kg, offset); - read_node(kg, offset); + read_node(kg, &offset); + read_node(kg, &offset); + read_node(kg, &offset); + read_node(kg, &offset); } - return; + return offset; } float3 N = stack_valid(data_node.x) ? 
stack_load_float3(stack, data_node.x) : sd->N; @@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset; - uint4 data_node2 = read_node(kg, offset); + uint4 data_node2 = read_node(kg, &offset); float3 T = stack_load_float3(stack, data_node.y); svm_unpack_node_uchar4(data_node.z, @@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, float specular_weight = (1.0f - final_transmission); // get the base color - uint4 data_base_color = read_node(kg, offset); + uint4 data_base_color = read_node(kg, &offset); float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : make_float3(__uint_as_float(data_base_color.y), @@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, __uint_as_float(data_base_color.w)); // get the additional clearcoat normal and subsurface scattering radius - uint4 data_cn_ssr = read_node(kg, offset); + uint4 data_cn_ssr = read_node(kg, &offset); float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) : + 1.4f; + float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ? + stack_load_float(stack, data_cn_ssr.w) : + 0.0f; // get the subsurface color - uint4 data_subsurface_color = read_node(kg, offset); + uint4 data_subsurface_color = read_node(kg, &offset); float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : make_float3(__uint_as_float(data_subsurface_color.y), @@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, if (bssrdf) { bssrdf->radius = subsurface_radius * subsurface; - bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ? - subsurface_color : - mixed_ss_base_color; - bssrdf->texture_blur = 0.0f; - bssrdf->sharpness = 0.0f; + bssrdf->albedo = mixed_ss_base_color; bssrdf->N = N; bssrdf->roughness = roughness; + /* Clamps protecting against bad/extreme and non physical values. */ + subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f); + bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f); + /* setup bsdf */ - sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method); + sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior); } } } @@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: { - uint4 data_node2 = read_node(kg, offset); - uint4 data_node3 = read_node(kg, offset); - uint4 data_node4 = read_node(kg, offset); + uint4 data_node2 = read_node(kg, &offset); + uint4 data_node3 = read_node(kg, &offset); + uint4 data_node4 = read_node(kg, &offset); float3 weight = sd->svm_closure_weight * mix_weight; @@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, #endif /* __HAIR__ */ #ifdef __SUBSURFACE__ - case CLOSURE_BSSRDF_CUBIC_ID: - case CLOSURE_BSSRDF_GAUSSIAN_ID: - case CLOSURE_BSSRDF_BURLEY_ID: - case CLOSURE_BSSRDF_RANDOM_WALK_ID: { + case CLOSURE_BSSRDF_RANDOM_WALK_ID: + case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: { float3 weight = sd->svm_closure_weight * mix_weight; Bssrdf *bssrdf = bssrdf_alloc(sd, weight); @@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, bssrdf->radius = stack_load_float3(stack, data_node.z) * param1; bssrdf->albedo = sd->svm_closure_weight; - bssrdf->texture_blur = param2; - 
bssrdf->sharpness = stack_load_float(stack, data_node.w); bssrdf->N = N; - bssrdf->roughness = 0.0f; - sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); + bssrdf->roughness = FLT_MAX; + + const float subsurface_ior = clamp(param2, 1.01f, 3.8f); + const float subsurface_anisotropy = stack_load_float(stack, data_node.w); + bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f); + + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior); } break; @@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, default: break; } + + return offset; } -ccl_device void svm_node_closure_volume( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type) +template<ShaderType shader_type> +ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { #ifdef __VOLUME__ /* Only sum extinction for volumes, variable is shared with surface transparency. */ @@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume( #endif } -ccl_device void svm_node_principled_volume(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node, - ShaderType shader_type, - int path_flag, - int *offset) +template<ShaderType shader_type> +ccl_device_noinline int svm_node_principled_volume( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset) { #ifdef __VOLUME__ - uint4 value_node = read_node(kg, offset); - uint4 attr_node = read_node(kg, offset); + uint4 value_node = read_node(kg, &offset); + uint4 attr_node = read_node(kg, &offset); /* Only sum extinction for volumes, variable is shared with surface transparency. 
*/ if (shader_type != SHADER_TYPE_VOLUME) { - return; + return offset; } uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset; @@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, 1.0f); if (mix_weight == 0.0f) { - return; + return offset; } /* Compute density. */ @@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, /* Compute emission. */ if (path_flag & PATH_RAY_SHADOW) { /* Don't need emission for shadows. */ - return; + return offset; } uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset; @@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, } } #endif + return offset; } -ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; float3 weight = sd->svm_closure_weight; @@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no emission_setup(sd, weight); } -ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; float3 weight = sd->svm_closure_weight; @@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 background_setup(sd, weight); } -ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; @@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset) { float3 weight = stack_load_float3(stack, weight_offset); - 
svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_emission_weight(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint color_offset = node.y; uint strength_offset = node.z; @@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) { /* fetch weight from blend input, previous mix closures, * and write to stack to be used by closure nodes later */ @@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) /* (Bump) normal */ ccl_device void svm_node_set_normal( - KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) + const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); sd->N = normal; diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h index 5df6c9fb755..37d40167ccc 100644 --- a/intern/cycles/kernel/svm/svm_convert.h +++ b/intern/cycles/kernel/svm/svm_convert.h @@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN /* Conversion Nodes */ -ccl_device void svm_node_convert( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to) +ccl_device_noinline void svm_node_convert( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to) { switch (type) { case NODE_CONVERT_FI: { diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 250fac6bcb8..a1d952173d8 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -14,11 +14,16 @@ * limitations 
under the License. */ +#include "kernel/kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Bump Node */ -ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { #ifdef __RAY_DIFFERENTIALS__ /* get normal input */ @@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac /* Displacement Node */ -ccl_device void svm_node_set_displacement(KernelGlobals *kg, +ccl_device void svm_node_set_displacement(const KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset) @@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, sd->P += dP; } -ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint height_offset, midlevel_offset, scale_offset, normal_offset; svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset); @@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float * stack_store_float3(stack, node.z, dP); } -ccl_device void svm_node_vector_displacement( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_vector_displacement( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 data_node = read_node(kg, offset); + uint4 data_node = read_node(kg, &offset); uint space = data_node.x; uint vector_offset, midlevel_offset, scale_offset, displacement_offset; @@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement( } stack_store_float3(stack, displacement_offset, dP); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 
96d602e35bf..b5ecdbe2abf 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* Fresnel Node */ -ccl_device void svm_node_fresnel( +ccl_device_noinline void svm_node_fresnel( ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node) { uint normal_offset, out_offset; @@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel( /* Layer Weight Node */ -ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) { uint blend_offset = node.y; uint blend_value = node.z; diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h index 65eb08eb0eb..f6fafdee941 100644 --- a/intern/cycles/kernel/svm/svm_gamma.h +++ b/intern/cycles/kernel/svm/svm_gamma.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_gamma( +ccl_device_noinline void svm_node_gamma( ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color) { float3 color = stack_load_float3(stack, in_color); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index e48e96dcfa4..10e9f291d0e 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN /* Geometry Node */ -ccl_device_inline void svm_node_geometry( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float3 data; @@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry( stack_store_float3(stack, out_offset, data); } -ccl_device void svm_node_geometry_bump_dx( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry_bump_dx( + const KernelGlobals *kg, ShaderData 
*sd, float *stack, uint type, uint out_offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx( #endif } -ccl_device void svm_node_geometry_bump_dy( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry_bump_dy( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy( /* Object Info */ -ccl_device void svm_node_object_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_object_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float data; @@ -140,8 +140,8 @@ ccl_device void svm_node_object_info( /* Particle Info */ -ccl_device void svm_node_particle_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_particle_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { switch (type) { case NODE_INFO_PAR_INDEX: { @@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info( /* Hair Info */ -ccl_device void svm_node_hair_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_hair_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float data; float3 data3; diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h index 08304bc47e8..cd15f7097e7 100644 --- a/intern/cycles/kernel/svm/svm_gradient.h +++ b/intern/cycles/kernel/svm/svm_gradient.h @@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type) return 0.0f; } -ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void 
svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node) { uint type, co_offset, color_offset, fac_offset; diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h index c299cf58c7f..6f49a8385aa 100644 --- a/intern/cycles/kernel/svm/svm_hsv.h +++ b/intern/cycles/kernel/svm/svm_hsv.h @@ -19,8 +19,10 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_hsv( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint in_color_offset, fac_offset, out_color_offset; uint hue_offset, sat_offset, val_offset; diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h index 56c804b44d0..9c13734ecf0 100644 --- a/intern/cycles/kernel/svm/svm_ies.h +++ b/intern/cycles/kernel/svm/svm_ies.h @@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN /* IES Light */ ccl_device_inline float interpolate_ies_vertical( - KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h) + const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h) { /* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end * of v (corresponding to the north pole) would result in artifacts. 
The proper way of dealing @@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical( return cubic_interp(a, b, c, d, v_frac); } -ccl_device_inline float kernel_ies_interp(KernelGlobals *kg, +ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg, int slot, float h_angle, float v_angle) @@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg, return max(cubic_interp(a, b, c, d, h_frac), 0.0f); } -ccl_device void svm_node_ies( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline void svm_node_ies(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint vector_offset, strength_offset, fac_offset, slot = node.z; svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 9348ddabde5..a344f36977a 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags) +ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags) { if (id == -1) { return make_float4( @@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co) return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f; } -ccl_device void svm_node_tex_image( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_image( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint co_offset, out_offset, alpha_offset, flags; @@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image( int num_nodes = (int)node.y; if (num_nodes > 0) { /* Remember the offset of the node following the tile nodes. */ - int next_offset = (*offset) + num_nodes; + int next_offset = offset + num_nodes; /* Find the tile that the UV lies in. 
*/ int tx = (int)tex_co.x; @@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image( /* Find the index of the tile. */ for (int i = 0; i < num_nodes; i++) { - uint4 tile_node = read_node(kg, offset); + uint4 tile_node = read_node(kg, &offset); if (tile_node.x == tile) { id = tile_node.y; break; @@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image( } /* Skip over the remaining nodes. */ - *offset = next_offset; + offset = next_offset; } else { id = -num_nodes; @@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image( stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); if (stack_valid(alpha_offset)) stack_store_float(stack, alpha_offset, f.w); + return offset; } -ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { /* get object space normal */ float3 N = sd->N; @@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float stack_store_float(stack, alpha_offset, f.w); } -ccl_device void svm_node_tex_environment(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint id = node.y; uint co_offset, out_offset, alpha_offset, flags; diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h index 02024742b13..27cdaaff473 100644 --- a/intern/cycles/kernel/svm/svm_invert.h +++ b/intern/cycles/kernel/svm/svm_invert.h @@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor) return factor * (1.0f - color) + (1.0f - factor) * color; } -ccl_device void svm_node_invert( +ccl_device_noinline void svm_node_invert( ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color) { float factor = stack_load_float(stack, in_fac); diff --git 
a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 768c65918cd..49fabad1cc5 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN /* Light Path Node */ -ccl_device void svm_node_light_path(ShaderData *sd, - ccl_addr_space PathState *state, - float *stack, - uint type, - uint out_offset, - int path_flag) +ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *sd, + float *stack, + uint type, + uint out_offset, + int path_flag) { float info = 0.0f; @@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd, case NODE_LP_ray_length: info = sd->ray_length; break; - case NODE_LP_ray_depth: - info = (float)state->bounce; + case NODE_LP_ray_depth: { + /* Read bounce from difference location depending if this is a shadow + * path. It's a bit dubious to have integrate state details leak into + * this function but hard to avoid currently. */ + int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 : + (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) : + INTEGRATOR_STATE(path, bounce); + + /* For background, light emission and shadow evaluation we from a + * surface or volume we are effective one bounce further. */ + if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) { + bounce++; + } + + info = (float)bounce; break; + } + /* TODO */ + case NODE_LP_ray_transparent: { + const int bounce = (INTEGRATOR_STATE_IS_NULL) ? + 0 : + (path_flag & PATH_RAY_SHADOW) ? 
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) : + INTEGRATOR_STATE(path, transparent_bounce); + + info = (float)bounce; + break; + } +#if 0 case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; - case NODE_LP_ray_transparent: - info = (float)state->transparent_bounce; - break; +#endif +#if 0 case NODE_LP_ray_transmission: info = (float)state->transmission_bounce; break; +#endif } stack_store_float(stack, out_offset, info); @@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd, /* Light Falloff Node */ -ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) { uint strength_offset, out_offset, smooth_offset; diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h index 9c160e6d8cc..8784c760860 100644 --- a/intern/cycles/kernel/svm/svm_magic.h +++ b/intern/cycles/kernel/svm/svm_magic.h @@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion) return make_float3(0.5f - x, 0.5f - y, 0.5f - z); } -ccl_device void svm_node_tex_magic( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_magic( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint depth; uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset; @@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic( svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset); svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset); - uint4 node2 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); float3 co = stack_load_float3(stack, co_offset); float scale = stack_load_float_default(stack, scale_offset, node2.x); float distortion = stack_load_float_default(stack, distortion_offset, 
node2.y); @@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic( stack_store_float(stack, fac_offset, average(color)); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, color); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h index 533a631c837..c8684981e31 100644 --- a/intern/cycles/kernel/svm/svm_map_range.h +++ b/intern/cycles/kernel/svm/svm_map_range.h @@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x) return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); } -ccl_device void svm_node_map_range(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint value_stack_offset, - uint parameters_stack_offsets, - uint results_stack_offsets, - int *offset) +ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint value_stack_offset, + uint parameters_stack_offsets, + uint results_stack_offsets, + int offset) { uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset; uint type_stack_offset, steps_stack_offset, result_stack_offset; @@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg, svm_unpack_node_uchar3( results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset); - uint4 defaults = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float value = stack_load_float(stack, value_stack_offset); float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x); @@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg, result = 0.0f; } stack_store_float(stack, result_stack_offset, result); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h index 6e19c859e19..fcc724405f5 100644 --- 
a/intern/cycles/kernel/svm/svm_mapping.h +++ b/intern/cycles/kernel/svm/svm_mapping.h @@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN /* Mapping Node */ -ccl_device void svm_node_mapping(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint result_stack_offset) { uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset; svm_unpack_node_uchar4(inputs_stack_offsets, @@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg, /* Texture Mapping */ -ccl_device void svm_node_texture_mapping( - KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset) +ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint vec_offset, + uint out_offset, + int offset) { float3 v = stack_load_float3(stack, vec_offset); Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); float3 r = transform_point(&tfm, v); stack_store_float3(stack, out_offset, r); + return offset; } -ccl_device void svm_node_min_max( - KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset) +ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint vec_offset, + uint out_offset, + int offset) { float3 v = stack_load_float3(stack, vec_offset); - float3 mn = float4_to_float3(read_node_float(kg, offset)); - float3 mx = float4_to_float3(read_node_float(kg, offset)); + float3 mn = float4_to_float3(read_node_float(kg, &offset)); + float3 mx = float4_to_float3(read_node_float(kg, 
&offset)); float3 r = min(max(mn, v), mx); stack_store_float3(stack, out_offset, r); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h index 733ea28f9e5..99e7a8f2bda 100644 --- a/intern/cycles/kernel/svm/svm_math.h +++ b/intern/cycles/kernel/svm/svm_math.h @@ -16,13 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_math(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline void svm_node_math(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint result_stack_offset) { uint a_stack_offset, b_stack_offset, c_stack_offset; svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset); @@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg, stack_store_float(stack, result_stack_offset, result); } -ccl_device void svm_node_vector_math(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint outputs_stack_offsets, - int *offset) +ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint outputs_stack_offsets, + int offset) { uint value_stack_offset, vector_stack_offset; uint a_stack_offset, b_stack_offset, param1_stack_offset; @@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg, /* 3 Vector Operators */ if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD || type == NODE_VECTOR_MATH_MULTIPLY_ADD) { - uint4 extra_node = read_node(kg, offset); + uint4 extra_node = read_node(kg, &offset); c = stack_load_float3(stack, extra_node.x); } @@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg, stack_store_float(stack, value_stack_offset, value); if (stack_valid(vector_stack_offset)) 
stack_store_float3(stack, vector_stack_offset, vector); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h index 15114bfd5e4..3e38080977f 100644 --- a/intern/cycles/kernel/svm/svm_mix.h +++ b/intern/cycles/kernel/svm/svm_mix.h @@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN /* Node */ -ccl_device void svm_node_mix(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint fac_offset, - uint c1_offset, - uint c2_offset, - int *offset) +ccl_device_noinline int svm_node_mix(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint fac_offset, + uint c1_offset, + uint c2_offset, + int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float fac = stack_load_float(stack, fac_offset); float3 c1 = stack_load_float3(stack, c1_offset); @@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg, float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2); stack_store_float3(stack, node1.z, result); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h index 571f62fe27f..03a8b68b3ef 100644 --- a/intern/cycles/kernel/svm/svm_musgrave.h +++ b/intern/cycles/kernel/svm/svm_musgrave.h @@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d( return value; } -ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offsets1, - uint offsets2, - uint offsets3, - int *offset) +ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offsets1, + uint offsets2, + uint offsets3, + int offset) { uint type, dimensions, co_stack_offset, w_stack_offset; uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset; @@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, &lacunarity_stack_offset); 
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset); - uint4 defaults1 = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults1 = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float3 co = stack_load_float3(stack, co_stack_offset); float w = stack_load_float_default(stack, w_stack_offset, defaults1.x); @@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, } stack_store_float(stack, fac_stack_offset, fac); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 94d8bfde555..ecb4df6afdf 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) * |__________________________| * */ -ccl_device_noinline float perlin_2d(float x, float y) +ccl_device_noinline_cpu float perlin_2d(float x, float y) { ssei XY; ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); @@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) * v7 (1, 1, 1) * */ -ccl_device_noinline float perlin_3d(float x, float y, float z) +ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { ssei XYZ; ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); @@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z) * v15 (1, 1, 1, 1) * */ -ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { ssei XYZW; ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); @@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) * |__________________________| * */ -ccl_device_noinline float perlin_3d(float x, float y, float z) +ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { ssei XYZ; ssef fxyz = floorfrac(ssef(x, y, z, 
0.0f), &XYZ); @@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z) * v15 (1, 1, 1, 1) * */ -ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { ssei XYZW; ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h index 61fd9553802..29b262ac06e 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co, } } -ccl_device void svm_node_tex_noise(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint offsets1, - uint offsets2, - int *offset) +ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint offsets1, + uint offsets2, + int offset) { uint vector_stack_offset, w_stack_offset, scale_stack_offset; uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset; @@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg, &value_stack_offset, &color_stack_offset); - uint4 defaults1 = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults1 = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float3 vector = stack_load_float3(stack, vector_stack_offset); float w = stack_load_float_default(stack, w_stack_offset, defaults1.x); @@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg, if (stack_valid(color_stack_offset)) { stack_store_float3(stack, color_stack_offset, color); } + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h index 4cd3eab0ed2..724b5f281f9 100644 --- a/intern/cycles/kernel/svm/svm_normal.h +++ b/intern/cycles/kernel/svm/svm_normal.h @@ -16,16 +16,16 @@ CCL_NAMESPACE_BEGIN -ccl_device void 
svm_node_normal(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint in_normal_offset, - uint out_normal_offset, - uint out_dot_offset, - int *offset) +ccl_device_noinline int svm_node_normal(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint in_normal_offset, + uint out_normal_offset, + uint out_dot_offset, + int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float3 normal = stack_load_float3(stack, in_normal_offset); float3 direction; @@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg, if (stack_valid(out_dot_offset)) stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal))); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index 85ccf39144b..e92df3c093c 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device_inline float4 rgb_ramp_lookup( - KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size) +ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg, + int offset, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if ((f < 0.0f || f > 1.0f) && extrapolate) { float4 t0, dy; @@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup( return a; } -ccl_device void svm_node_rgb_ramp( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_rgb_ramp( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint fac_offset, color_offset, alpha_offset; uint interpolate = node.z; svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset); - uint table_size = read_node(kg, offset).x; + uint table_size = read_node(kg, &offset).x; float fac = 
stack_load_float(stack, fac_offset); - float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size); + float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, float4_to_float3(color)); if (stack_valid(alpha_offset)) stack_store_float(stack, alpha_offset, color.w); - *offset += table_size; + offset += table_size; + return offset; } -ccl_device void svm_node_curves( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_curves( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint fac_offset, color_offset, out_offset; svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset); - uint table_size = read_node(kg, offset).x; + uint table_size = read_node(kg, &offset).x; float fac = stack_load_float(stack, fac_offset); float3 color = stack_load_float3(stack, color_offset); @@ -89,14 +94,15 @@ ccl_device void svm_node_curves( const float range_x = max_x - min_x; const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x; - float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x; - float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y; - float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z; + float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x; + float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y; + float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z; color = (1.0f - fac) * color + fac * make_float3(r, g, b); stack_store_float3(stack, out_offset, color); - *offset += table_size; + offset += table_size; + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h index f501252062e..8d52845ea3d 100644 --- 
a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h +++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h @@ -16,15 +16,15 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_combine_hsv(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint hue_in, - uint saturation_in, - uint value_in, - int *offset) +ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint hue_in, + uint saturation_in, + uint value_in, + int offset) { - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); uint color_out = node1.y; float hue = stack_load_float(stack, hue_in); @@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, if (stack_valid(color_out)) stack_store_float3(stack, color_out, color); + return offset; } -ccl_device void svm_node_separate_hsv(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint color_in, - uint hue_out, - uint saturation_out, - int *offset) +ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint color_in, + uint hue_out, + uint saturation_out, + int offset) { - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); uint value_out = node1.y; float3 color = stack_load_float3(stack, color_in); @@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, stack_store_float(stack, saturation_out, color.y); if (stack_valid(value_out)) stack_store_float(stack, value_out, color.z); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h index b908732f026..b77c4311e72 100644 --- a/intern/cycles/kernel/svm/svm_sky.h +++ b/intern/cycles/kernel/svm/svm_sky.h @@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma) (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma); } -ccl_device float3 sky_radiance_preetham(KernelGlobals *kg, +ccl_device float3 sky_radiance_preetham(const 
KernelGlobals *kg, float3 dir, float sunphi, float suntheta, @@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float configuration[6] * mieM + configuration[7] * zenith); } -ccl_device float3 sky_radiance_hosek(KernelGlobals *kg, +ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg, float3 dir, float sunphi, float suntheta, @@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon) return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)); } -ccl_device float3 sky_radiance_nishita(KernelGlobals *kg, +ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg, float3 dir, float *nishita_data, uint texture_id) @@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg, return xyz_to_rgb(kg, xyz); } -ccl_device void svm_node_tex_sky( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_sky( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { /* Load data */ uint dir_offset = node.y; @@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky( float sunphi, suntheta, radiance_x, radiance_y, radiance_z; float config_x[9], config_y[9], config_z[9]; - float4 data = read_node_float(kg, offset); + float4 data = read_node_float(kg, &offset); sunphi = data.x; suntheta = data.y; radiance_x = data.z; radiance_y = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); radiance_z = data.x; config_x[0] = data.y; config_x[1] = data.z; config_x[2] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_x[3] = data.x; config_x[4] = data.y; config_x[5] = data.z; config_x[6] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_x[7] = data.x; config_x[8] = data.y; config_y[0] = data.z; config_y[1] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); 
config_y[2] = data.x; config_y[3] = data.y; config_y[4] = data.z; config_y[5] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_y[6] = data.x; config_y[7] = data.y; config_y[8] = data.z; config_z[0] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_z[1] = data.x; config_z[2] = data.y; config_z[3] = data.z; config_z[4] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_z[5] = data.x; config_z[6] = data.y; config_z[7] = data.z; @@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky( /* Define variables */ float nishita_data[10]; - float4 data = read_node_float(kg, offset); + float4 data = read_node_float(kg, &offset); nishita_data[0] = data.x; nishita_data[1] = data.y; nishita_data[2] = data.z; nishita_data[3] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); nishita_data[4] = data.x; nishita_data[5] = data.y; nishita_data[6] = data.z; nishita_data[7] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); nishita_data[8] = data.x; nishita_data[9] = data.y; uint texture_id = __float_as_uint(data.z); @@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky( } stack_store_float3(stack, out_offset, f); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index 46600551cc4..a35253080da 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -14,12 +14,16 @@ * limitations under the License. 
*/ +#include "kernel/geom/geom.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Texture Coordinate Node */ -ccl_device void svm_node_tex_coord( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { float3 data; uint type = node.y; @@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord( } stack_store_float3(stack, out_offset, data); + return offset; } -ccl_device void svm_node_tex_coord_bump_dx( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord_bump_dx( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx( case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); + data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)); 
else data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; @@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx( } stack_store_float3(stack, out_offset, data); + return offset; #else - svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); + return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); #endif } -ccl_device void svm_node_tex_coord_bump_dy( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord_bump_dy( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy( case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); + data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)); else data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; @@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy( } stack_store_float3(stack, out_offset, data); + return offset; #else - svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); + return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); #endif } -ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint color_offset, 
strength_offset, normal_offset, space; svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space); @@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st stack_store_float3(stack, normal_offset, N); } -ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint tangent_offset, direction_type, axis; svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis); diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 062afcfa5ac..c053be96c51 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN /* Nodes */ -/* Known frequencies of used nodes, used for selective nodes compilation - * in the kernel. Currently only affects split OpenCL kernel. - * - * Keep as defines so it's easy to check which nodes are to be compiled - * from preprocessor. - * - * Lower the number of group more often the node is used. - */ -#define NODE_GROUP_LEVEL_0 0 -#define NODE_GROUP_LEVEL_1 1 -#define NODE_GROUP_LEVEL_2 2 -#define NODE_GROUP_LEVEL_3 3 -#define NODE_GROUP_LEVEL_4 4 -#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4 - -#define NODE_FEATURE_VOLUME (1 << 0) -#define NODE_FEATURE_HAIR (1 << 1) -#define NODE_FEATURE_BUMP (1 << 2) -#define NODE_FEATURE_BUMP_STATE (1 << 3) -#define NODE_FEATURE_VORONOI_EXTRA (1 << 4) -/* TODO(sergey): Consider using something like ((uint)(-1)). - * Need to check carefully operand types around usage of this - * define first. 
- */ -#define NODE_FEATURE_ALL \ - (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \ - NODE_FEATURE_VORONOI_EXTRA) - -#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__) -#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0) - typedef enum ShaderNodeType { NODE_END = 0, NODE_SHADER_JUMP, @@ -572,12 +541,8 @@ typedef enum ClosureType { CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ - CLOSURE_BSSRDF_CUBIC_ID, - CLOSURE_BSSRDF_GAUSSIAN_ID, - CLOSURE_BSSRDF_PRINCIPLED_ID, - CLOSURE_BSSRDF_BURLEY_ID, CLOSURE_BSSRDF_RANDOM_WALK_ID, - CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID, + CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID, /* Other */ CLOSURE_HOLDOUT_ID, @@ -620,11 +585,9 @@ typedef enum ClosureType { type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \ type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \ type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) -#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) +#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) #define CLOSURE_IS_BSSRDF(type) \ - (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) -#define CLOSURE_IS_DISK_BSSRDF(type) \ - (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) + (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) #define CLOSURE_IS_VOLUME(type) \ (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) #define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h index 5b76f2c8832..d0478660094 100644 --- a/intern/cycles/kernel/svm/svm_value.h +++ b/intern/cycles/kernel/svm/svm_value.h @@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN /* Value Nodes */ ccl_device void svm_node_value_f( - KernelGlobals *kg, ShaderData *sd, float *stack, 
uint ivalue, uint out_offset) + const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset) { stack_store_float(stack, out_offset, __uint_as_float(ivalue)); } -ccl_device void svm_node_value_v( - KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset) +ccl_device int svm_node_value_v( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float3 p = make_float3( __uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w)); stack_store_float3(stack, out_offset, p); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h index 50045752484..55e1bce0158 100644 --- a/intern/cycles/kernel/svm/svm_vector_rotate.h +++ b/intern/cycles/kernel/svm/svm_vector_rotate.h @@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN /* Vector Rotate */ -ccl_device void svm_node_vector_rotate(ShaderData *sd, - float *stack, - uint input_stack_offsets, - uint axis_stack_offsets, - uint result_stack_offset) +ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd, + float *stack, + uint input_stack_offsets, + uint axis_stack_offsets, + uint result_stack_offset) { uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset, angle_stack_offset, invert; diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 1e95492cf1b..8aedb7e0f54 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN /* Vector Transform */ -ccl_device void svm_node_vector_transform(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + 
uint4 node) { uint itype, ifrom, ito; uint vector_in, vector_out; diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h index 0aa45835522..986ea244f3a 100644 --- a/intern/cycles/kernel/svm/svm_vertex_color.h +++ b/intern/cycles/kernel/svm/svm_vertex_color.h @@ -16,12 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_vertex_color(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { @@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg, } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_vertex_color_bump_dx(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { @@ -62,18 +56,12 @@ ccl_device_noinline } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_vertex_color_bump_dy(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { diff --git a/intern/cycles/kernel/svm/svm_voronoi.h 
b/intern/cycles/kernel/svm/svm_voronoi.h index d0e7db35fab..b1d2eff7f37 100644 --- a/intern/cycles/kernel/svm/svm_voronoi.h +++ b/intern/cycles/kernel/svm/svm_voronoi.h @@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float *outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f; } -ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint feature, - uint metric, - int *offset) +template<uint node_feature_mask> +ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint feature, + uint metric, + int offset) { - uint4 stack_offsets = read_node(kg, offset); - uint4 defaults = read_node(kg, offset); + uint4 stack_offsets = read_node(kg, &offset); + uint4 defaults = read_node(kg, &offset); uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset; uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset, @@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, &color_out, &position_out_2d); break; -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_2d(coord_2d, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_2d); + if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + voronoi_smooth_f1_2d(coord_2d, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_2d); + } break; -#endif case NODE_VORONOI_F2: voronoi_f2_2d(coord_2d, exponent, @@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, &color_out, &position_out); break; -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_3d(coord, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out); + if 
(KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + voronoi_smooth_f1_3d(coord, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out); + } break; -#endif case NODE_VORONOI_F2: voronoi_f2_3d(coord, exponent, @@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, break; } -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case 4: { - float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w); - float4 position_out_4d; - switch (voronoi_feature) { - case NODE_VORONOI_F1: - voronoi_f1_4d(coord_4d, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_4d(coord_4d, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_F2: - voronoi_f2_4d(coord_4d, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_DISTANCE_TO_EDGE: - voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out); - break; - case NODE_VORONOI_N_SPHERE_RADIUS: - voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out); - break; - default: - kernel_assert(0); + if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w); + float4 position_out_4d; + switch (voronoi_feature) { + case NODE_VORONOI_F1: + voronoi_f1_4d(coord_4d, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_SMOOTH_F1: + voronoi_smooth_f1_4d(coord_4d, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_F2: + voronoi_f2_4d(coord_4d, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_DISTANCE_TO_EDGE: + voronoi_distance_to_edge_4d(coord_4d, 
randomness, &distance_out); + break; + case NODE_VORONOI_N_SPHERE_RADIUS: + voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out); + break; + default: + kernel_assert(0); + } + position_out_4d = safe_divide_float4_float(position_out_4d, scale); + position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z); + w_out = position_out_4d.w; } - position_out_4d = safe_divide_float4_float(position_out_4d, scale); - position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z); - w_out = position_out_4d.w; break; } -#endif default: kernel_assert(0); } @@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, stack_store_float(stack, w_out_stack_offset, w_out); if (stack_valid(radius_out_stack_offset)) stack_store_float(stack, radius_out_stack_offset, radius_out); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 4bc14f82382..78b75405356 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN /* TODO(sergey): Think of making it more generic volume-type attribute * sampler. 
*/ -ccl_device void svm_node_tex_voxel( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_voxel( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint co_offset, density_out_offset, color_out_offset, space; svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space); @@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel( else { kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD); Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); co = transform_point(&tfm, co); } @@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel( stack_store_float(stack, density_out_offset, r.w); if (stack_valid(color_out_offset)) stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z)); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h index c4763475b47..00f980c16df 100644 --- a/intern/cycles/kernel/svm/svm_wave.h +++ b/intern/cycles/kernel/svm/svm_wave.h @@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type, } } -ccl_device void svm_node_tex_wave( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_wave( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 node2 = read_node(kg, offset); - uint4 node3 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); + uint4 node3 = read_node(kg, &offset); /* RNA properties */ uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset; @@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave( stack_store_float(stack, fac_offset, f); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, 
make_float3(f, f, f)); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h index d6144802559..fba8aa63d31 100644 --- a/intern/cycles/kernel/svm/svm_wavelength.h +++ b/intern/cycles/kernel/svm/svm_wavelength.h @@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = { {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}}; -ccl_device void svm_node_wavelength( - KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out) +ccl_device_noinline void svm_node_wavelength( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out) { float lambda_nm = stack_load_float(stack, wavelength); float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80 diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h index b30d85acaec..0306d2e7b9c 100644 --- a/intern/cycles/kernel/svm/svm_white_noise.h +++ b/intern/cycles/kernel/svm/svm_white_noise.h @@ -16,13 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_tex_white_noise(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint inputs_stack_offsets, - uint ouptuts_stack_offsets, - int *offset) +ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint inputs_stack_offsets, + uint ouptuts_stack_offsets) { uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset; svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset); diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 49158bd86d5..7ec913789d2 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -35,7 +35,7 @@ 
CCL_NAMESPACE_BEGIN /* Wireframe Node */ ccl_device_inline float wireframe( - KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P) + const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P) { #ifdef __HAIR__ if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) @@ -88,7 +88,10 @@ ccl_device_inline float wireframe( return 0.0f; } -ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint in_size = node.y; uint out_fac = node.z; @@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta int pixel_size = (int)use_pixel_size; /* Calculate wireframe */ -#ifdef __SPLIT_KERNEL__ - /* TODO(sergey): This is because sd is actually a global space, - * which makes it difficult to re-use same wireframe() function. - * - * With OpenCL 2.0 it's possible to avoid this change, but for until - * then we'll be living with such an exception. - */ - float3 P = sd->P; - float f = wireframe(kg, sd, size, pixel_size, &P); -#else float f = wireframe(kg, sd, size, pixel_size, &sd->P); -#endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if (bump_offset == NODE_BUMP_OFFSET_DX) { diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index feead27c5ca..6edb5261b32 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -32,10 +32,10 @@ set(SRC camera.cpp colorspace.cpp constant_fold.cpp - coverage.cpp denoising.cpp film.cpp geometry.cpp + gpu_display.cpp graph.cpp hair.cpp image.cpp @@ -54,6 +54,7 @@ set(SRC object.cpp osl.cpp particles.cpp + pass.cpp curves.cpp scene.cpp session.cpp @@ -76,10 +77,10 @@ set(SRC_HEADERS camera.h colorspace.h constant_fold.h - coverage.h denoising.h film.h geometry.h + gpu_display.h graph.h hair.h image.h @@ -95,6 +96,7 @@ set(SRC_HEADERS object.h osl.h particles.h + pass.h procedural.h curves.h scene.h @@ -111,6 +113,7 @@ set(SRC_HEADERS set(LIB cycles_bvh cycles_device + cycles_integrator cycles_subd cycles_util ) diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp index b925e755434..ae6290ac27b 100644 --- a/intern/cycles/render/background.cpp +++ b/intern/cycles/render/background.cpp @@ -34,11 +34,7 @@ NODE_DEFINE(Background) { NodeType *type = NodeType::add("background", create); - SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f); - SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX); - SOCKET_BOOLEAN(use_shader, "Use Shader", true); - SOCKET_BOOLEAN(use_ao, "Use AO", false); SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY); SOCKET_BOOLEAN(transparent, "Transparent", false); @@ -80,10 +76,6 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene /* set shader index and transparent option */ KernelBackground *kbackground = &dscene->data.background; - kbackground->ao_factor = (use_ao) ? 
ao_factor : 0.0f; - kbackground->ao_bounces_factor = ao_factor; - kbackground->ao_distance = ao_distance; - kbackground->transparent = transparent; kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader); @@ -138,10 +130,6 @@ void Background::tag_update(Scene *scene) * and to avoid doing unnecessary updates anywhere else. */ tag_use_shader_modified(); } - - if (ao_factor_is_modified() || use_ao_is_modified()) { - scene->integrator->tag_update(scene, Integrator::BACKGROUND_AO_MODIFIED); - } } Shader *Background::get_shader(const Scene *scene) diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index e89ffbc2445..2f7ef0f7737 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -32,11 +32,7 @@ class Background : public Node { public: NODE_DECLARE - NODE_SOCKET_API(float, ao_factor) - NODE_SOCKET_API(float, ao_distance) - NODE_SOCKET_API(bool, use_shader) - NODE_SOCKET_API(bool, use_ao) NODE_SOCKET_API(uint, visibility) NODE_SOCKET_API(Shader *, shader) diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index 317a3937cab..54e496caed6 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -26,58 +26,8 @@ CCL_NAMESPACE_BEGIN -static int aa_samples(Scene *scene, Object *object, ShaderEvalType type) -{ - if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) { - return 1; - } - else if (type == SHADER_EVAL_NORMAL) { - /* Only antialias normal if mesh has bump mapping. 
*/ - if (object->get_geometry()) { - foreach (Node *node, object->get_geometry()->get_used_shaders()) { - Shader *shader = static_cast<Shader *>(node); - if (shader->has_bump) { - return scene->integrator->get_aa_samples(); - } - } - } - - return 1; - } - else { - return scene->integrator->get_aa_samples(); - } -} - -/* Keep it synced with kernel_bake.h logic */ -static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter) -{ - const int component_flags = pass_filter & - (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR); - - switch (type) { - case SHADER_EVAL_AO: - return BAKE_FILTER_AO; - case SHADER_EVAL_SHADOW: - return BAKE_FILTER_DIRECT; - case SHADER_EVAL_DIFFUSE: - return BAKE_FILTER_DIFFUSE | component_flags; - case SHADER_EVAL_GLOSSY: - return BAKE_FILTER_GLOSSY | component_flags; - case SHADER_EVAL_TRANSMISSION: - return BAKE_FILTER_TRANSMISSION | component_flags; - case SHADER_EVAL_COMBINED: - return pass_filter; - default: - return 0; - } -} - BakeManager::BakeManager() { - type = SHADER_EVAL_BAKE; - pass_filter = 0; - need_update_ = true; } @@ -85,32 +35,14 @@ BakeManager::~BakeManager() { } -bool BakeManager::get_baking() +bool BakeManager::get_baking() const { return !object_name.empty(); } -void BakeManager::set(Scene *scene, - const std::string &object_name_, - ShaderEvalType type_, - int pass_filter_) +void BakeManager::set(Scene *scene, const std::string &object_name_) { object_name = object_name_; - type = type_; - pass_filter = shader_type_to_pass_filter(type_, pass_filter_); - - Pass::add(PASS_BAKE_PRIMITIVE, scene->passes); - Pass::add(PASS_BAKE_DIFFERENTIAL, scene->passes); - - if (type == SHADER_EVAL_UV) { - /* force UV to be available */ - Pass::add(PASS_UV, scene->passes); - } - - /* force use_light_pass to be true if we bake more than just colors */ - if (pass_filter & ~BAKE_FILTER_COLOR) { - Pass::add(PASS_LIGHT, scene->passes); - } /* create device and update scene */ scene->film->tag_modified(); @@ -127,29 
+59,29 @@ void BakeManager::device_update(Device * /*device*/, if (!need_update()) return; - scoped_callback_timer timer([scene](double time) { - if (scene->update_stats) { - scene->update_stats->bake.times.add_entry({"device_update", time}); - } - }); - - KernelIntegrator *kintegrator = &dscene->data.integrator; KernelBake *kbake = &dscene->data.bake; + memset(kbake, 0, sizeof(*kbake)); - kbake->type = type; - kbake->pass_filter = pass_filter; - - int object_index = 0; - foreach (Object *object, scene->objects) { - const Geometry *geom = object->get_geometry(); - if (object->name == object_name && geom->geometry_type == Geometry::MESH) { - kbake->object_index = object_index; - kbake->tri_offset = geom->prim_offset; - kintegrator->aa_samples = aa_samples(scene, object, type); - break; - } + if (!object_name.empty()) { + scoped_callback_timer timer([scene](double time) { + if (scene->update_stats) { + scene->update_stats->bake.times.add_entry({"device_update", time}); + } + }); + + kbake->use = true; - object_index++; + int object_index = 0; + foreach (Object *object, scene->objects) { + const Geometry *geom = object->get_geometry(); + if (object->name == object_name && geom->geometry_type == Geometry::MESH) { + kbake->object_index = object_index; + kbake->tri_offset = geom->prim_offset; + break; + } + + object_index++; + } } need_update_ = false; diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index 655b9b1cf7e..39e504490c2 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -30,8 +30,8 @@ class BakeManager { BakeManager(); ~BakeManager(); - void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter); - bool get_baking(); + void set(Scene *scene, const std::string &object_name); + bool get_baking() const; void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress); void device_free(Device *device, DeviceScene *dscene); @@ -42,8 +42,6 @@ class BakeManager 
{ private: bool need_update_; - ShaderEvalType type; - int pass_filter; std::string object_name; }; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fcfad58995e..1cdae3af7f5 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -28,537 +28,334 @@ CCL_NAMESPACE_BEGIN -/* Buffer Params */ +/* -------------------------------------------------------------------- + * Convert part information to an index of `BufferParams::pass_offset_`. + */ -BufferParams::BufferParams() +static int pass_type_mode_to_index(PassType pass_type, PassMode mode) { - width = 0; - height = 0; - - full_x = 0; - full_y = 0; - full_width = 0; - full_height = 0; + int index = static_cast<int>(pass_type) * 2; - denoising_data_pass = false; - denoising_clean_pass = false; - denoising_prefiltered_pass = false; + if (mode == PassMode::DENOISED) { + ++index; + } - Pass::add(PASS_COMBINED, passes); + return index; } -void BufferParams::get_offset_stride(int &offset, int &stride) +static int pass_to_index(const BufferPass &pass) { - offset = -(full_x + full_y * width); - stride = width; + return pass_type_mode_to_index(pass.type, pass.mode); } -bool BufferParams::modified(const BufferParams ¶ms) -{ - return !(full_x == params.full_x && full_y == params.full_y && width == params.width && - height == params.height && full_width == params.full_width && - full_height == params.full_height && Pass::equals(passes, params.passes) && - denoising_data_pass == params.denoising_data_pass && - denoising_clean_pass == params.denoising_clean_pass && - denoising_prefiltered_pass == params.denoising_prefiltered_pass); -} +/* -------------------------------------------------------------------- + * Buffer pass. 
+ */ -int BufferParams::get_passes_size() +NODE_DEFINE(BufferPass) { - int size = 0; + NodeType *type = NodeType::add("buffer_pass", create); - for (size_t i = 0; i < passes.size(); i++) - size += passes[i].components; + const NodeEnum *pass_type_enum = Pass::get_type_enum(); + const NodeEnum *pass_mode_enum = Pass::get_mode_enum(); - if (denoising_data_pass) { - size += DENOISING_PASS_SIZE_BASE; - if (denoising_clean_pass) - size += DENOISING_PASS_SIZE_CLEAN; - if (denoising_prefiltered_pass) - size += DENOISING_PASS_SIZE_PREFILTERED; - } + SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); + SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED)); + SOCKET_STRING(name, "Name", ustring()); + SOCKET_BOOLEAN(include_albedo, "Include Albedo", false); - return align_up(size, 4); -} + SOCKET_INT(offset, "Offset", -1); -int BufferParams::get_denoising_offset() -{ - int offset = 0; - - for (size_t i = 0; i < passes.size(); i++) - offset += passes[i].components; - - return offset; + return type; } -int BufferParams::get_denoising_prefiltered_offset() +BufferPass::BufferPass() : Node(get_node_type()) { - assert(denoising_prefiltered_pass); - - int offset = get_denoising_offset(); - - offset += DENOISING_PASS_SIZE_BASE; - if (denoising_clean_pass) { - offset += DENOISING_PASS_SIZE_CLEAN; - } - - return offset; } -/* Render Buffer Task */ - -RenderTile::RenderTile() +BufferPass::BufferPass(const Pass *scene_pass) + : Node(get_node_type()), + type(scene_pass->get_type()), + mode(scene_pass->get_mode()), + name(scene_pass->get_name()), + include_albedo(scene_pass->get_include_albedo()) { - x = 0; - y = 0; - w = 0; - h = 0; - - sample = 0; - start_sample = 0; - num_samples = 0; - resolution = 0; - - offset = 0; - stride = 0; - - buffer = 0; - - buffers = NULL; - stealing_state = NO_STEALING; } -/* Render Buffers */ - -RenderBuffers::RenderBuffers(Device *device) - : buffer(device, "RenderBuffers", MEM_READ_WRITE), - map_neighbor_copied(false), - 
render_time(0.0f) +PassInfo BufferPass::get_info() const { + return Pass::get_info(type, include_albedo); } -RenderBuffers::~RenderBuffers() -{ - buffer.free(); -} +/* -------------------------------------------------------------------- + * Buffer Params. + */ -void RenderBuffers::reset(BufferParams ¶ms_) +NODE_DEFINE(BufferParams) { - params = params_; - - /* re-allocate buffer */ - buffer.alloc(params.width * params.get_passes_size(), params.height); - buffer.zero_to_device(); + NodeType *type = NodeType::add("buffer_params", create); + + SOCKET_INT(width, "Width", 0); + SOCKET_INT(height, "Height", 0); + + SOCKET_INT(full_x, "Full X", 0); + SOCKET_INT(full_y, "Full Y", 0); + SOCKET_INT(full_width, "Full Width", 0); + SOCKET_INT(full_height, "Full Height", 0); + + SOCKET_STRING(layer, "Layer", ustring()); + SOCKET_STRING(view, "View", ustring()); + SOCKET_FLOAT(exposure, "Exposure", 1.0f); + SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false); + SOCKET_BOOLEAN(use_transparent_background, "Transparent Background", false); + + /* Notes: + * - Skip passes since they do not follow typical container socket definition. + * Might look into covering those as a socket in the future. + * + * - Skip offset, stride, and pass stride since those can be delivered from the passes and + * rest of the sockets. 
*/ + + return type; } -void RenderBuffers::zero() +BufferParams::BufferParams() : Node(get_node_type()) { - buffer.zero_to_device(); + reset_pass_offset(); } -bool RenderBuffers::copy_from_device() +void BufferParams::update_passes() { - if (!buffer.device_pointer) - return false; - - buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height); - - return true; -} - -bool RenderBuffers::get_denoising_pass_rect( - int type, float exposure, int sample, int components, float *pixels) -{ - if (buffer.data() == NULL) { - return false; - } - - float scale = 1.0f; - float alpha_scale = 1.0f / sample; - if (type == DENOISING_PASS_PREFILTERED_COLOR || type == DENOISING_PASS_CLEAN || - type == DENOISING_PASS_PREFILTERED_INTENSITY) { - scale *= exposure; - } - else if (type == DENOISING_PASS_PREFILTERED_VARIANCE) { - scale *= exposure * exposure * (sample - 1); - } + update_offset_stride(); + reset_pass_offset(); + + pass_stride = 0; + for (const BufferPass &pass : passes) { + if (pass.offset != PASS_UNUSED) { + const int index = pass_to_index(pass); + if (pass_offset_[index] == PASS_UNUSED) { + pass_offset_[index] = pass_stride; + } - int offset; - if (type == DENOISING_PASS_CLEAN) { - /* The clean pass isn't changed by prefiltering, so we use the original one there. */ - offset = type + params.get_denoising_offset(); - scale /= sample; - } - else if (params.denoising_prefiltered_pass) { - offset = type + params.get_denoising_prefiltered_offset(); - } - else { - switch (type) { - case DENOISING_PASS_PREFILTERED_DEPTH: - offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH; - break; - case DENOISING_PASS_PREFILTERED_NORMAL: - offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL; - break; - case DENOISING_PASS_PREFILTERED_ALBEDO: - offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO; - break; - case DENOISING_PASS_PREFILTERED_COLOR: - /* If we're not saving the prefiltering result, return the original noisy pass. 
*/ - offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; - break; - default: - return false; + pass_stride += pass.get_info().num_components; } - scale /= sample; } +} - int pass_stride = params.get_passes_size(); - int size = params.width * params.height; +void BufferParams::update_passes(const vector<Pass *> &scene_passes) +{ + passes.clear(); - float *in = buffer.data() + offset; + pass_stride = 0; + for (const Pass *scene_pass : scene_passes) { + BufferPass buffer_pass(scene_pass); - if (components == 1) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - pixels[0] = in[0] * scale; + if (scene_pass->is_written()) { + buffer_pass.offset = pass_stride; + pass_stride += scene_pass->get_info().num_components; } - } - else if (components == 3) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - pixels[0] = in[0] * scale; - pixels[1] = in[1] * scale; - pixels[2] = in[2] * scale; - } - } - else if (components == 4) { - /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ - assert(params.passes[0].type == PASS_COMBINED); - float *in_combined = buffer.data(); - - for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { - float3 val = make_float3(in[0], in[1], in[2]); - if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) { - /* Remove highlight compression from the image. 
*/ - val = color_highlight_uncompress(val); - } - pixels[0] = val.x * scale; - pixels[1] = val.y * scale; - pixels[2] = val.z * scale; - pixels[3] = saturate(in_combined[3] * alpha_scale); + else { + buffer_pass.offset = PASS_UNUSED; } - } - else { - return false; + + passes.emplace_back(std::move(buffer_pass)); } - return true; + update_passes(); } -bool RenderBuffers::get_pass_rect( - const string &name, float exposure, int sample, int components, float *pixels) +void BufferParams::reset_pass_offset() { - if (buffer.data() == NULL) { - return false; + for (int i = 0; i < kNumPassOffsets; ++i) { + pass_offset_[i] = PASS_UNUSED; } +} - float *sample_count = NULL; - if (name == "Combined") { - int sample_offset = 0; - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; - if (pass.type != PASS_SAMPLE_COUNT) { - sample_offset += pass.components; - continue; - } - else { - sample_count = buffer.data() + sample_offset; - break; - } - } +int BufferParams::get_pass_offset(PassType pass_type, PassMode mode) const +{ + if (pass_type == PASS_NONE || pass_type == PASS_UNUSED) { + return PASS_UNUSED; } - int pass_offset = 0; - - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; + const int index = pass_type_mode_to_index(pass_type, mode); + return pass_offset_[index]; +} - /* Pass is identified by both type and name, multiple of the same type - * may exist with a different name. */ - if (pass.name != name) { - pass_offset += pass.components; - continue; +const BufferPass *BufferParams::find_pass(string_view name) const +{ + for (const BufferPass &pass : passes) { + if (pass.name == name) { + return &pass; } + } - PassType type = pass.type; - - float *in = buffer.data() + pass_offset; - int pass_stride = params.get_passes_size(); - - float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f; - float scale_exposure = (pass.exposure) ? 
scale * exposure : scale; - - int size = params.width * params.height; + return nullptr; +} - if (components == 1 && type == PASS_RENDER_TIME) { - /* Render time is not stored by kernel, but measured per tile. */ - float val = (float)(1000.0 * render_time / (params.width * params.height * sample)); - for (int i = 0; i < size; i++, pixels++) { - pixels[0] = val; - } - } - else if (components == 1) { - assert(pass.components == components); - - /* Scalar */ - if (type == PASS_DEPTH) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; - } - } - else if (type == PASS_MIST) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = saturate(f * scale_exposure); - } - } - else { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f * scale_exposure; - } - } - } - else if (components == 3) { - assert(pass.components == 4); - - /* RGBA */ - if (type == PASS_SHADOW) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - } - } - else if (pass.divide_type != PASS_NONE) { - /* RGB lighting passes that need to divide out color */ - pass_offset = 0; - for (size_t k = 0; k < params.passes.size(); k++) { - Pass &color_pass = params.passes[k]; - if (color_pass.type == pass.divide_type) - break; - pass_offset += color_pass.components; - } - - float *in_divide = buffer.data() + pass_offset; - - for (int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); - - f = safe_divide_even_color(f * exposure, f_divide); - - pixels[0] = f.x; - pixels[1] = f.y; - pixels[2] = f.z; - } - } - else { - /* RGB/vector */ - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - - pixels[0] = f.x * scale_exposure; - pixels[1] = f.y * scale_exposure; - pixels[2] = f.z * scale_exposure; - } - } - } - else if (components == 4) { - assert(pass.components == components); - - /* RGBA */ - if (type == PASS_SHADOW) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - pixels[3] = 1.0f; - } - } - else if (type == PASS_MOTION) { - /* need to normalize by number of samples accumulated for motion */ - pass_offset = 0; - for (size_t k = 0; k < params.passes.size(); k++) { - Pass &color_pass = params.passes[k]; - if (color_pass.type == PASS_MOTION_WEIGHT) - break; - pass_offset += color_pass.components; - } - - float *in_weight = buffer.data() + pass_offset; - - for (int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float w = in_weight[0]; - float invw = (w > 0.0f) ? 1.0f / w : 0.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - pixels[3] = f.w * invw; - } - } - else if (type == PASS_CRYPTOMATTE) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - /* x and z contain integer IDs, don't rescale them. - y and w contain matte weights, they get scaled. */ - pixels[0] = f.x; - pixels[1] = f.y * scale; - pixels[2] = f.z; - pixels[3] = f.w * scale; - } - } - else { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - if (sample_count && sample_count[i * pass_stride] < 0.0f) { - scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f; - scale_exposure = (pass.exposure) ? scale * exposure : scale; - } - - float4 f = make_float4(in[0], in[1], in[2], in[3]); - - pixels[0] = f.x * scale_exposure; - pixels[1] = f.y * scale_exposure; - pixels[2] = f.z * scale_exposure; - - /* Clamp since alpha might be > 1.0 due to Russian roulette. 
*/ - pixels[3] = saturate(f.w * scale); - } - } +const BufferPass *BufferParams::find_pass(PassType type, PassMode mode) const +{ + for (const BufferPass &pass : passes) { + if (pass.type == type && pass.mode == mode) { + return &pass; } - - return true; } - return false; + return nullptr; } -bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels, int samples) +const BufferPass *BufferParams::get_actual_display_pass(PassType type, PassMode mode) const { - if (buffer.data() == NULL) { - return false; - } - - int pass_offset = 0; + const BufferPass *pass = find_pass(type, mode); + return get_actual_display_pass(pass); +} - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; +const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass) const +{ + if (!pass) { + return nullptr; + } - if (pass.type != type) { - pass_offset += pass.components; - continue; + if (pass->type == PASS_COMBINED) { + const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode); + if (shadow_catcher_matte_pass) { + pass = shadow_catcher_matte_pass; } + } - float *out = buffer.data() + pass_offset; - int pass_stride = params.get_passes_size(); - int size = params.width * params.height; - - assert(pass.components == components); + return pass; +} - for (int i = 0; i < size; i++, out += pass_stride, pixels += components) { - if (pass.filter) { - /* Scale by the number of samples, inverse of what we do in get_pass_rect. - * A better solution would be to remove the need for set_pass_rect entirely, - * and change baking to bake multiple objects in a tile at once. */ - for (int j = 0; j < components; j++) { - out[j] = pixels[j] * samples; - } - } - else { - /* For non-filtered passes just straight copy, these may contain non-float data. 
*/ - memcpy(out, pixels, sizeof(float) * components); - } - } +void BufferParams::update_offset_stride() +{ + offset = -(full_x + full_y * width); + stride = width; +} +bool BufferParams::modified(const BufferParams &other) const +{ + if (!(width == other.width && height == other.height && full_x == other.full_x && + full_y == other.full_y && full_width == other.full_width && + full_height == other.full_height && offset == other.offset && stride == other.stride && + pass_stride == other.pass_stride && layer == other.layer && view == other.view && + exposure == other.exposure && + use_approximate_shadow_catcher == other.use_approximate_shadow_catcher && + use_transparent_background == other.use_transparent_background)) { return true; } - return false; + return !(passes == other.passes); } -/* Display Buffer */ +/* -------------------------------------------------------------------- + * Render Buffers. + */ -DisplayBuffer::DisplayBuffer(Device *device, bool linear) - : draw_width(0), - draw_height(0), - transparent(true), /* todo: determine from background */ - half_float(linear), - rgba_byte(device, "display buffer byte"), - rgba_half(device, "display buffer half") +RenderBuffers::RenderBuffers(Device *device) : buffer(device, "RenderBuffers", MEM_READ_WRITE) { } -DisplayBuffer::~DisplayBuffer() +RenderBuffers::~RenderBuffers() { - rgba_byte.free(); - rgba_half.free(); + buffer.free(); } -void DisplayBuffer::reset(BufferParams ¶ms_) +void RenderBuffers::reset(const BufferParams ¶ms_) { - draw_width = 0; - draw_height = 0; + DCHECK(params_.pass_stride != -1); params = params_; - /* allocate display pixels */ - if (half_float) { - rgba_half.alloc_to_device(params.width, params.height); - } - else { - rgba_byte.alloc_to_device(params.width, params.height); - } + /* re-allocate buffer */ + buffer.alloc(params.width * params.pass_stride, params.height); } -void DisplayBuffer::draw_set(int width, int height) +void RenderBuffers::zero() { - assert(width <= params.width && 
height <= params.height); + buffer.zero_to_device(); +} - draw_width = width; - draw_height = height; +bool RenderBuffers::copy_from_device() +{ + DCHECK(params.pass_stride != -1); + + if (!buffer.device_pointer) + return false; + + buffer.copy_from_device(0, params.width * params.pass_stride, params.height); + + return true; } -void DisplayBuffer::draw(Device *device, const DeviceDrawParams &draw_params) +void RenderBuffers::copy_to_device() { - if (draw_width != 0 && draw_height != 0) { - device_memory &rgba = (half_float) ? (device_memory &)rgba_half : (device_memory &)rgba_byte; - - device->draw_pixels(rgba, - 0, - draw_width, - draw_height, - params.width, - params.height, - params.full_x, - params.full_y, - params.full_width, - params.full_height, - transparent, - draw_params); - } + buffer.copy_to_device(); } -bool DisplayBuffer::draw_ready() +void render_buffers_host_copy_denoised(RenderBuffers *dst, + const BufferParams &dst_params, + const RenderBuffers *src, + const BufferParams &src_params, + const size_t src_offset) { - return (draw_width != 0 && draw_height != 0); + DCHECK_EQ(dst_params.width, src_params.width); + /* TODO(sergey): More sanity checks to avoid buffer overrun. */ + + /* Create a map of pass ofsets to be copied. + * Assume offsets are different to allow copying passes between buffers with different set of + * passes. 
*/ + + struct { + int dst_offset; + int src_offset; + } pass_offsets[PASS_NUM]; + + int num_passes = 0; + + for (int i = 0; i < PASS_NUM; ++i) { + const PassType pass_type = static_cast<PassType>(i); + + const int dst_pass_offset = dst_params.get_pass_offset(pass_type, PassMode::DENOISED); + if (dst_pass_offset == PASS_UNUSED) { + continue; + } + + const int src_pass_offset = src_params.get_pass_offset(pass_type, PassMode::DENOISED); + if (src_pass_offset == PASS_UNUSED) { + continue; + } + + pass_offsets[num_passes].dst_offset = dst_pass_offset; + pass_offsets[num_passes].src_offset = src_pass_offset; + ++num_passes; + } + + /* Copy passes. */ + /* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */ + + const int64_t dst_width = dst_params.width; + const int64_t dst_height = dst_params.height; + const int64_t dst_pass_stride = dst_params.pass_stride; + const int64_t dst_num_pixels = dst_width * dst_height; + + const int64_t src_pass_stride = src_params.pass_stride; + const int64_t src_offset_in_floats = src_offset * src_pass_stride; + + const float *src_pixel = src->buffer.data() + src_offset_in_floats; + float *dst_pixel = dst->buffer.data(); + + for (int i = 0; i < dst_num_pixels; + ++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride) { + for (int pass_offset_idx = 0; pass_offset_idx < num_passes; ++pass_offset_idx) { + const int dst_pass_offset = pass_offsets[pass_offset_idx].dst_offset; + const int src_pass_offset = pass_offsets[pass_offset_idx].src_offset; + + /* TODO(sergey): Support non-RGBA passes. 
*/ + dst_pixel[dst_pass_offset + 0] = src_pixel[src_pass_offset + 0]; + dst_pixel[dst_pass_offset + 1] = src_pixel[src_pass_offset + 1]; + dst_pixel[dst_pass_offset + 2] = src_pixel[src_pass_offset + 2]; + dst_pixel[dst_pass_offset + 3] = src_pixel[src_pass_offset + 3]; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 4ffc628bb52..c048234167d 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -18,8 +18,8 @@ #define __BUFFERS_H__ #include "device/device_memory.h" - -#include "render/film.h" +#include "graph/node.h" +#include "render/pass.h" #include "kernel/kernel_types.h" @@ -34,170 +34,156 @@ class Device; struct DeviceDrawParams; struct float4; +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */ +class BufferPass : public Node { + public: + NODE_DECLARE + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + ustring name; + bool include_albedo = false; + + int offset = -1; + + BufferPass(); + explicit BufferPass(const Pass *scene_pass); + + BufferPass(BufferPass &&other) noexcept = default; + BufferPass(const BufferPass &other) = default; + + BufferPass &operator=(BufferPass &&other) = default; + BufferPass &operator=(const BufferPass &other) = default; + + ~BufferPass() = default; + + PassInfo get_info() const; + + inline bool operator==(const BufferPass &other) const + { + return type == other.type && mode == other.mode && name == other.name && + include_albedo == other.include_albedo && offset == other.offset; + } + inline bool operator!=(const BufferPass &other) const + { + return !(*this == other); + } +}; + /* Buffer Parameters * Size of render buffer and how it fits in the full image (border render). */ -class BufferParams { +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. 
*/ +class BufferParams : public Node { public: - /* width/height of the physical buffer */ - int width; - int height; - - /* offset into and width/height of the full buffer */ - int full_x; - int full_y; - int full_width; - int full_height; - - /* passes */ - vector<Pass> passes; - bool denoising_data_pass; - /* If only some light path types should be target, an additional pass is needed. */ - bool denoising_clean_pass; - /* When we're prefiltering the passes during rendering, we need to keep both the - * original and the prefiltered data around because neighboring tiles might still - * need the original data. */ - bool denoising_prefiltered_pass; - - /* functions */ - BufferParams(); + NODE_DECLARE - void get_offset_stride(int &offset, int &stride); - bool modified(const BufferParams ¶ms); - int get_passes_size(); - int get_denoising_offset(); - int get_denoising_prefiltered_offset(); -}; + /* Width/height of the physical buffer. */ + int width = 0; + int height = 0; -/* Render Buffers */ + /* Offset into and width/height of the full buffer. */ + int full_x = 0; + int full_y = 0; + int full_width = 0; + int full_height = 0; -class RenderBuffers { - public: - /* buffer parameters */ - BufferParams params; + /* Runtime fields, only valid after `update_passes()` or `update_offset_stride()`. */ + int offset = -1, stride = -1; - /* float buffer */ - device_vector<float> buffer; - bool map_neighbor_copied; - double render_time; + /* Runtime fields, only valid after `update_passes()`. */ + int pass_stride = -1; - explicit RenderBuffers(Device *device); - ~RenderBuffers(); + /* Properties which are used for accessing buffer pixels outside of scene graph. 
*/ + vector<BufferPass> passes; + ustring layer; + ustring view; + float exposure = 1.0f; + bool use_approximate_shadow_catcher = false; + bool use_transparent_background = false; - void reset(BufferParams ¶ms); - void zero(); + BufferParams(); - bool copy_from_device(); - bool get_pass_rect( - const string &name, float exposure, int sample, int components, float *pixels); - bool get_denoising_pass_rect( - int offset, float exposure, int sample, int components, float *pixels); - bool set_pass_rect(PassType type, int components, float *pixels, int samples); -}; + BufferParams(BufferParams &&other) noexcept = default; + BufferParams(const BufferParams &other) = default; -/* Display Buffer - * - * The buffer used for drawing during render, filled by converting the render - * buffers to byte of half float storage */ + BufferParams &operator=(BufferParams &&other) = default; + BufferParams &operator=(const BufferParams &other) = default; -class DisplayBuffer { - public: - /* buffer parameters */ - BufferParams params; - /* dimensions for how much of the buffer is actually ready for display. - * with progressive render we can be using only a subset of the buffer. - * if these are zero, it means nothing can be drawn yet */ - int draw_width, draw_height; - /* draw alpha channel? */ - bool transparent; - /* use half float? */ - bool half_float; - /* byte buffer for converted result */ - device_pixels<uchar4> rgba_byte; - device_pixels<half4> rgba_half; - - DisplayBuffer(Device *device, bool linear = false); - ~DisplayBuffer(); - - void reset(BufferParams ¶ms); - - void draw_set(int width, int height); - void draw(Device *device, const DeviceDrawParams &draw_params); - bool draw_ready(); -}; + ~BufferParams() = default; -/* Render Tile - * Rendering task on a buffer */ + /* Pre-calculate all fields which depends on the passes. 
+ * + * When the scene passes are given, the buffer passes will be created from them and stored in + * this params, and then params are updated for those passes. + * The `update_passes()` without parameters updates offsets and stries which are stored outside + * of the passes. */ + void update_passes(); + void update_passes(const vector<Pass *> &scene_passes); -class RenderTile { - public: - typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task; + /* Returns PASS_UNUSED if there is no such pass in the buffer. */ + int get_pass_offset(PassType type, PassMode mode = PassMode::NOISY) const; - Task task; - int x, y, w, h; - int start_sample; - int num_samples; - int sample; - int resolution; - int offset; - int stride; - int tile_index; + /* Returns nullptr if pass with given name does not exist. */ + const BufferPass *find_pass(string_view name) const; + const BufferPass *find_pass(PassType type, PassMode mode = PassMode::NOISY) const; - device_ptr buffer; - int device_size; + /* Get display pass from its name. + * Will do special logic to replace combined pass with shadow catcher matte. */ + const BufferPass *get_actual_display_pass(PassType type, PassMode mode = PassMode::NOISY) const; + const BufferPass *get_actual_display_pass(const BufferPass *pass) const; - typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState; - StealingState stealing_state; + void update_offset_stride(); - RenderBuffers *buffers; + bool modified(const BufferParams &other) const; - RenderTile(); + protected: + void reset_pass_offset(); - int4 bounds() const - { - return make_int4(x, /* xmin */ - y, /* ymin */ - x + w, /* xmax */ - y + h); /* ymax */ - } + /* Multipled by 2 to be able to store noisy and denoised pass types. */ + static constexpr int kNumPassOffsets = PASS_NUM * 2; + + /* Indexed by an index derived from pass type and mode, indicates offset of the corresponding + * pass in the buffer. 
+ * If there are multiple passes with same type and mode contains lowest offset of all of them. */ + int pass_offset_[kNumPassOffsets]; }; -/* Render Tile Neighbors - * Set of neighboring tiles used for denoising. Tile order: - * 0 1 2 - * 3 4 5 - * 6 7 8 */ +/* Render Buffers */ -class RenderTileNeighbors { +class RenderBuffers { public: - static const int SIZE = 9; - static const int CENTER = 4; + /* buffer parameters */ + BufferParams params; - RenderTile tiles[SIZE]; - RenderTile target; + /* float buffer */ + device_vector<float> buffer; - RenderTileNeighbors(const RenderTile ¢er) - { - tiles[CENTER] = center; - } + explicit RenderBuffers(Device *device); + ~RenderBuffers(); - int4 bounds() const - { - return make_int4(tiles[3].x, /* xmin */ - tiles[1].y, /* ymin */ - tiles[5].x + tiles[5].w, /* xmax */ - tiles[7].y + tiles[7].h); /* ymax */ - } + void reset(const BufferParams ¶ms); + void zero(); - void set_bounds_from_center() - { - tiles[3].x = tiles[CENTER].x; - tiles[1].y = tiles[CENTER].y; - tiles[5].x = tiles[CENTER].x + tiles[CENTER].w; - tiles[7].y = tiles[CENTER].y + tiles[CENTER].h; - } + bool copy_from_device(); + void copy_to_device(); }; +/* Copy denoised passes form source to destination. + * + * Buffer parameters are provided explicitly, allowing to copy pixelks between render buffers which + * content corresponds to a render result at a non-unit resolution divider. + * + * `src_offset` allows to offset source pixel index which is used when a fraction of the source + * buffer is to be copied. + * + * Copy happens of the number of pixels in the destination. 
*/ +void render_buffers_host_copy_denoised(RenderBuffers *dst, + const BufferParams &dst_params, + const RenderBuffers *src, + const BufferParams &src_params, + const size_t src_offset = 0); + CCL_NAMESPACE_END #endif /* __BUFFERS_H__ */ diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index 327f166f9d8..8b69c971991 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -33,9 +33,9 @@ /* needed for calculating differentials */ // clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_projection.h" #include "kernel/kernel_differential.h" #include "kernel/kernel_montecarlo.h" @@ -169,7 +169,6 @@ Camera::Camera() : Node(get_node_type()) width = 1024; height = 512; - resolution = 1; use_perspective_motion = false; @@ -455,7 +454,6 @@ void Camera::update(Scene *scene) /* render size */ kcam->width = width; kcam->height = height; - kcam->resolution = resolution; /* store differentials */ kcam->dx = float3_to_float4(dx); @@ -776,9 +774,11 @@ float Camera::world_to_raster_size(float3 P) &ray); #endif - differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist); + /* TODO: would it help to use more accurate differentials here? 
*/ + differential3 dP; + differential_transfer_compact(&dP, ray.dP, ray.D, ray.dD, ray.D, dist); - return max(len(ray.dP.dx), len(ray.dP.dy)); + return max(len(dP.dx), len(dP.dy)); } return res; @@ -789,12 +789,11 @@ bool Camera::use_motion() const return motion.size() > 1; } -void Camera::set_screen_size_and_resolution(int width_, int height_, int resolution_) +void Camera::set_screen_size(int width_, int height_) { - if (width_ != width || height_ != height || resolution_ != resolution) { + if (width_ != width || height_ != height) { width = width_; height = height_; - resolution = resolution_; tag_modified(); } } diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 5abb4750764..cb8ecac1a7e 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -199,7 +199,6 @@ class Camera : public Node { private: int width; int height; - int resolution; public: /* functions */ @@ -225,7 +224,7 @@ class Camera : public Node { int motion_step(float time) const; bool use_motion() const; - void set_screen_size_and_resolution(int width_, int height_, int resolution_); + void set_screen_size(int width_, int height_); private: /* Private utility functions. */ diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp deleted file mode 100644 index 99d4daa6961..00000000000 --- a/intern/cycles/render/coverage.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2018 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "render/coverage.h" -#include "render/buffers.h" - -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" - -#include "kernel/kernel_globals.h" -#include "kernel/kernel_id_passes.h" - -#include "util/util_map.h" - -CCL_NAMESPACE_BEGIN - -static bool crypomatte_comp(const pair<float, float> &i, const pair<float, float> j) -{ - return i.first > j.first; -} - -void Coverage::finalize() -{ - int pass_offset = 0; - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - finalize_buffer(coverage_object, pass_offset); - pass_offset += kernel_data.film.cryptomatte_depth * 4; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - finalize_buffer(coverage_material, pass_offset); - pass_offset += kernel_data.film.cryptomatte_depth * 4; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - finalize_buffer(coverage_asset, pass_offset); - } -} - -void Coverage::init_path_trace() -{ - kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL; - - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - coverage_object.clear(); - coverage_object.resize(tile.w * tile.h); - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - coverage_material.clear(); - coverage_material.resize(tile.w * tile.h); - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - coverage_asset.clear(); - coverage_asset.resize(tile.w * tile.h); - } - } -} - -void Coverage::init_pixel(int x, int y) -{ - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - const int pixel_index = tile.w * (y - tile.y) + x - tile.x; - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - kg->coverage_object = &coverage_object[pixel_index]; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - 
kg->coverage_material = &coverage_material[pixel_index]; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - kg->coverage_asset = &coverage_asset[pixel_index]; - } - } -} - -void Coverage::finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset) -{ - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - flatten_buffer(coverage, pass_offset); - } - else { - sort_buffer(pass_offset); - } -} - -void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset) -{ - /* Sort the coverage map and write it to the output */ - int pixel_index = 0; - int pass_stride = tile.buffers->params.get_passes_size(); - for (int y = 0; y < tile.h; ++y) { - for (int x = 0; x < tile.w; ++x) { - const CoverageMap &pixel = coverage[pixel_index]; - if (!pixel.empty()) { - /* buffer offset */ - int index = x + y * tile.stride; - float *buffer = (float *)tile.buffer + index * pass_stride; - - /* sort the cryptomatte pixel */ - vector<pair<float, float>> sorted_pixel; - for (CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) { - sorted_pixel.push_back(std::make_pair(it->second, it->first)); - } - sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp); - int num_slots = 2 * (kernel_data.film.cryptomatte_depth); - if (sorted_pixel.size() > num_slots) { - float leftover = 0.0f; - for (vector<pair<float, float>>::iterator it = sorted_pixel.begin() + num_slots; - it != sorted_pixel.end(); - ++it) { - leftover += it->first; - } - sorted_pixel[num_slots - 1].first += leftover; - } - int limit = min(num_slots, sorted_pixel.size()); - for (int i = 0; i < limit; ++i) { - kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, - 2 * (kernel_data.film.cryptomatte_depth), - sorted_pixel[i].second, - sorted_pixel[i].first); - } - } - ++pixel_index; - } - } -} - -void Coverage::sort_buffer(const int pass_offset) -{ - /* Sort the coverage map and write it to the output */ - int pass_stride = 
tile.buffers->params.get_passes_size(); - for (int y = 0; y < tile.h; ++y) { - for (int x = 0; x < tile.w; ++x) { - /* buffer offset */ - int index = x + y * tile.stride; - float *buffer = (float *)tile.buffer + index * pass_stride; - kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, - 2 * (kernel_data.film.cryptomatte_depth)); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h deleted file mode 100644 index 12182c614da..00000000000 --- a/intern/cycles/render/coverage.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2018 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __COVERAGE_H__ -#define __COVERAGE_H__ - -#include "util/util_map.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -struct KernelGlobals; -class RenderTile; - -typedef unordered_map<float, float> CoverageMap; - -class Coverage { - public: - Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_) - { - } - void init_path_trace(); - void init_pixel(int x, int y); - void finalize(); - - private: - vector<CoverageMap> coverage_object; - vector<CoverageMap> coverage_material; - vector<CoverageMap> coverage_asset; - KernelGlobals *kg; - RenderTile &tile; - void finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset); - void flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset); - void sort_buffer(const int pass_offset); -}; - -CCL_NAMESPACE_END - -#endif /* __COVERAGE_H__ */ diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp index ddbe7484800..bcf8d3fa204 100644 --- a/intern/cycles/render/denoising.cpp +++ b/intern/cycles/render/denoising.cpp @@ -16,15 +16,17 @@ #include "render/denoising.h" -#include "kernel/filter/filter_defines.h" +#if 0 -#include "util/util_foreach.h" -#include "util/util_map.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_time.h" +# include "kernel/filter/filter_defines.h" -#include <OpenImageIO/filesystem.h> +# include "util/util_foreach.h" +# include "util/util_map.h" +# include "util/util_system.h" +# include "util/util_task.h" +# include "util/util_time.h" + +# include <OpenImageIO/filesystem.h> CCL_NAMESPACE_BEGIN @@ -225,7 +227,7 @@ bool DenoiseImageLayer::match_channels(int neighbor, /* Denoise Task */ DenoiseTask::DenoiseTask(Device *device, - Denoiser *denoiser, + DenoiserPipeline *denoiser, int frame, const vector<int> &neighbor_frames) : denoiser(denoiser), @@ -386,7 +388,6 @@ void DenoiseTask::create_task(DeviceTask &task) task.denoising = denoiser->params; task.denoising.type = DENOISER_NLM; 
task.denoising.use = true; - task.denoising.store_passes = false; task.denoising_from_render = false; task.denoising_frames.resize(neighbor_frames.size()); @@ -863,7 +864,7 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error) /* File pattern handling and outer loop over frames */ -Denoiser::Denoiser(DeviceInfo &device_info) +DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info) { samples_override = 0; tile_size = make_int2(64, 64); @@ -876,18 +877,16 @@ Denoiser::Denoiser(DeviceInfo &device_info) /* Initialize device. */ device = Device::create(device_info, stats, profiler, true); - DeviceRequestedFeatures req; - req.use_denoising = true; - device->load_kernels(req); + device->load_kernels(KERNEL_FEATURE_DENOISING); } -Denoiser::~Denoiser() +DenoiserPipeline::~DenoiserPipeline() { delete device; TaskScheduler::exit(); } -bool Denoiser::run() +bool DenoiserPipeline::run() { assert(input.size() == output.size()); @@ -931,3 +930,5 @@ bool Denoiser::run() } CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h index c1b4d0a5596..097cc570d06 100644 --- a/intern/cycles/render/denoising.h +++ b/intern/cycles/render/denoising.h @@ -17,27 +17,31 @@ #ifndef __DENOISING_H__ #define __DENOISING_H__ -#include "device/device.h" -#include "device/device_denoising.h" +#if 0 -#include "render/buffers.h" +/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or + * parameters. Currently it is an annoying mixture of terms used interchangeably. 
*/ -#include "util/util_string.h" -#include "util/util_unique_ptr.h" -#include "util/util_vector.h" +# include "device/device.h" -#include <OpenImageIO/imageio.h> +# include "render/buffers.h" + +# include "util/util_string.h" +# include "util/util_unique_ptr.h" +# include "util/util_vector.h" + +# include <OpenImageIO/imageio.h> OIIO_NAMESPACE_USING CCL_NAMESPACE_BEGIN -/* Denoiser */ +/* Denoiser pipeline */ -class Denoiser { +class DenoiserPipeline { public: - Denoiser(DeviceInfo &device_info); - ~Denoiser(); + DenoiserPipeline(DeviceInfo &device_info); + ~DenoiserPipeline(); bool run(); @@ -155,7 +159,10 @@ class DenoiseImage { class DenoiseTask { public: - DenoiseTask(Device *device, Denoiser *denoiser, int frame, const vector<int> &neighbor_frames); + DenoiseTask(Device *device, + DenoiserPipeline *denoiser, + int frame, + const vector<int> &neighbor_frames); ~DenoiseTask(); /* Task stages */ @@ -168,7 +175,7 @@ class DenoiseTask { protected: /* Denoiser parameters and device */ - Denoiser *denoiser; + DenoiserPipeline *denoiser; Device *device; /* Frame number to be denoised */ @@ -204,4 +211,6 @@ class DenoiseTask { CCL_NAMESPACE_END +#endif + #endif /* __DENOISING_H__ */ diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 5df396394c4..8e14b338bd3 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -16,9 +16,12 @@ #include "render/film.h" #include "device/device.h" +#include "render/background.h" +#include "render/bake.h" #include "render/camera.h" #include "render/integrator.h" #include "render/mesh.h" +#include "render/object.h" #include "render/scene.h" #include "render/stats.h" #include "render/tables.h" @@ -31,261 +34,6 @@ CCL_NAMESPACE_BEGIN -/* Pass */ - -static bool compare_pass_order(const Pass &a, const Pass &b) -{ - if (a.components == b.components) - return (a.type < b.type); - return (a.components > b.components); -} - -static NodeEnum *get_pass_type_enum() -{ - static NodeEnum 
pass_type_enum; - pass_type_enum.insert("combined", PASS_COMBINED); - pass_type_enum.insert("depth", PASS_DEPTH); - pass_type_enum.insert("normal", PASS_NORMAL); - pass_type_enum.insert("uv", PASS_UV); - pass_type_enum.insert("object_id", PASS_OBJECT_ID); - pass_type_enum.insert("material_id", PASS_MATERIAL_ID); - pass_type_enum.insert("motion", PASS_MOTION); - pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT); - pass_type_enum.insert("render_time", PASS_RENDER_TIME); - pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE); - pass_type_enum.insert("aov_color", PASS_AOV_COLOR); - pass_type_enum.insert("aov_value", PASS_AOV_VALUE); - pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER); - pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT); - pass_type_enum.insert("mist", PASS_MIST); - pass_type_enum.insert("emission", PASS_EMISSION); - pass_type_enum.insert("background", PASS_BACKGROUND); - pass_type_enum.insert("ambient_occlusion", PASS_AO); - pass_type_enum.insert("shadow", PASS_SHADOW); - pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT); - pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT); - pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR); - pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT); - pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT); - pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR); - pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT); - pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT); - pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR); - pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT); - pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT); - pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE); - pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL); - - return &pass_type_enum; -} - -NODE_DEFINE(Pass) -{ - NodeType *type = 
NodeType::add("pass", create); - - NodeEnum *pass_type_enum = get_pass_type_enum(); - SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); - SOCKET_STRING(name, "Name", ustring()); - - return type; -} - -Pass::Pass() : Node(get_node_type()) -{ -} - -void Pass::add(PassType type, vector<Pass> &passes, const char *name) -{ - for (size_t i = 0; i < passes.size(); i++) { - if (passes[i].type != type) { - continue; - } - - /* An empty name is used as a placeholder to signal that any pass of - * that type is fine (because the content always is the same). - * This is important to support divide_type: If the pass that has a - * divide_type is added first, a pass for divide_type with an empty - * name will be added. Then, if a matching pass with a name is later - * requested, the existing placeholder will be renamed to that. - * If the divide_type is explicitly allocated with a name first and - * then again as part of another pass, the second one will just be - * skipped because that type already exists. */ - - /* If no name is specified, any pass of the correct type will match. */ - if (name == NULL) { - return; - } - - /* If we already have a placeholder pass, rename that one. */ - if (passes[i].name.empty()) { - passes[i].name = name; - return; - } - - /* If neither existing nor requested pass have placeholder name, they - * must match. 
*/ - if (name == passes[i].name) { - return; - } - } - - Pass pass; - - pass.type = type; - pass.filter = true; - pass.exposure = false; - pass.divide_type = PASS_NONE; - if (name) { - pass.name = name; - } - - switch (type) { - case PASS_NONE: - pass.components = 0; - break; - case PASS_COMBINED: - pass.components = 4; - pass.exposure = true; - break; - case PASS_DEPTH: - pass.components = 1; - pass.filter = false; - break; - case PASS_MIST: - pass.components = 1; - break; - case PASS_NORMAL: - pass.components = 4; - break; - case PASS_UV: - pass.components = 4; - break; - case PASS_MOTION: - pass.components = 4; - pass.divide_type = PASS_MOTION_WEIGHT; - break; - case PASS_MOTION_WEIGHT: - pass.components = 1; - break; - case PASS_OBJECT_ID: - case PASS_MATERIAL_ID: - pass.components = 1; - pass.filter = false; - break; - - case PASS_EMISSION: - case PASS_BACKGROUND: - pass.components = 4; - pass.exposure = true; - break; - case PASS_AO: - pass.components = 4; - break; - case PASS_SHADOW: - pass.components = 4; - pass.exposure = false; - break; - case PASS_LIGHT: - /* This isn't a real pass, used by baking to see whether - * light data is needed or not. - * - * Set components to 0 so pass sort below happens in a - * determined way. - */ - pass.components = 0; - break; - case PASS_RENDER_TIME: - /* This pass is handled entirely on the host side. 
*/ - pass.components = 0; - break; - - case PASS_DIFFUSE_COLOR: - case PASS_GLOSSY_COLOR: - case PASS_TRANSMISSION_COLOR: - pass.components = 4; - break; - case PASS_DIFFUSE_DIRECT: - case PASS_DIFFUSE_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_DIFFUSE_COLOR; - break; - case PASS_GLOSSY_DIRECT: - case PASS_GLOSSY_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_GLOSSY_COLOR; - break; - case PASS_TRANSMISSION_DIRECT: - case PASS_TRANSMISSION_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_TRANSMISSION_COLOR; - break; - case PASS_VOLUME_DIRECT: - case PASS_VOLUME_INDIRECT: - pass.components = 4; - pass.exposure = true; - break; - case PASS_CRYPTOMATTE: - pass.components = 4; - break; - case PASS_ADAPTIVE_AUX_BUFFER: - pass.components = 4; - break; - case PASS_SAMPLE_COUNT: - pass.components = 1; - pass.exposure = false; - break; - case PASS_AOV_COLOR: - pass.components = 4; - break; - case PASS_AOV_VALUE: - pass.components = 1; - break; - case PASS_BAKE_PRIMITIVE: - case PASS_BAKE_DIFFERENTIAL: - pass.components = 4; - pass.exposure = false; - pass.filter = false; - break; - default: - assert(false); - break; - } - - passes.push_back(pass); - - /* Order from by components, to ensure alignment so passes with size 4 - * come first and then passes with size 1. Note this must use stable sort - * so cryptomatte passes remain in the right order. 
*/ - stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order); - - if (pass.divide_type != PASS_NONE) - Pass::add(pass.divide_type, passes); -} - -bool Pass::equals(const vector<Pass> &A, const vector<Pass> &B) -{ - if (A.size() != B.size()) - return false; - - for (int i = 0; i < A.size(); i++) - if (A[i].type != B[i].type || A[i].name != B[i].name) - return false; - - return true; -} - -bool Pass::contains(const vector<Pass> &passes, PassType type) -{ - for (size_t i = 0; i < passes.size(); i++) - if (passes[i].type == type) - return true; - - return false; -} - /* Pixel Filter */ static float filter_func_box(float /*v*/, float /*width*/) @@ -368,17 +116,11 @@ NODE_DEFINE(Film) SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f); SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f); - SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); - SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); - SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false); - SOCKET_INT(denoising_flags, "Denoising Flags", 0); - SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false); - - SOCKET_BOOLEAN(use_light_visibility, "Use Light Visibility", false); - - NodeEnum *pass_type_enum = get_pass_type_enum(); + const NodeEnum *pass_type_enum = Pass::get_type_enum(); SOCKET_ENUM(display_pass, "Display Pass", *pass_type_enum, PASS_COMBINED); + SOCKET_BOOLEAN(show_active_pixels, "Show Active Pixels", false); + static NodeEnum cryptomatte_passes_enum; cryptomatte_passes_enum.insert("none", CRYPT_NONE); cryptomatte_passes_enum.insert("object", CRYPT_OBJECT); @@ -389,15 +131,13 @@ NODE_DEFINE(Film) SOCKET_INT(cryptomatte_depth, "Cryptomatte Depth", 0); + SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false); + return type; } -Film::Film() : Node(get_node_type()) +Film::Film() : Node(get_node_type()), filter_table_offset_(TABLE_OFFSET_INVALID) { - 
use_light_visibility = false; - filter_table_offset = TABLE_OFFSET_INVALID; - cryptomatte_passes = CRYPT_NONE; - display_pass = PASS_COMBINED; } Film::~Film() @@ -406,7 +146,8 @@ Film::~Film() void Film::add_default(Scene *scene) { - Pass::add(PASS_COMBINED, scene->passes); + Pass *pass = scene->create_node<Pass>(); + pass->set_type(PASS_COMBINED); } void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) @@ -426,50 +167,77 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) /* update __data */ kfilm->exposure = exposure; + kfilm->pass_alpha_threshold = pass_alpha_threshold; kfilm->pass_flag = 0; - kfilm->display_pass_stride = -1; - kfilm->display_pass_components = 0; - kfilm->display_divide_pass_stride = -1; - kfilm->use_display_exposure = false; - kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED); + kfilm->use_approximate_shadow_catcher = get_use_approximate_shadow_catcher(); kfilm->light_pass_flag = 0; kfilm->pass_stride = 0; - kfilm->use_light_pass = use_light_visibility; - kfilm->pass_aov_value_num = 0; - kfilm->pass_aov_color_num = 0; + + /* Mark with PASS_UNUSED to avoid mask test in the kernel. */ + kfilm->pass_background = PASS_UNUSED; + kfilm->pass_emission = PASS_UNUSED; + kfilm->pass_ao = PASS_UNUSED; + kfilm->pass_diffuse_direct = PASS_UNUSED; + kfilm->pass_diffuse_indirect = PASS_UNUSED; + kfilm->pass_glossy_direct = PASS_UNUSED; + kfilm->pass_glossy_indirect = PASS_UNUSED; + kfilm->pass_transmission_direct = PASS_UNUSED; + kfilm->pass_transmission_indirect = PASS_UNUSED; + kfilm->pass_volume_direct = PASS_UNUSED; + kfilm->pass_volume_indirect = PASS_UNUSED; + kfilm->pass_volume_direct = PASS_UNUSED; + kfilm->pass_volume_indirect = PASS_UNUSED; + kfilm->pass_shadow = PASS_UNUSED; + + /* Mark passes as unused so that the kernel knows the pass is inaccessible. 
*/ + kfilm->pass_denoising_normal = PASS_UNUSED; + kfilm->pass_denoising_albedo = PASS_UNUSED; + kfilm->pass_sample_count = PASS_UNUSED; + kfilm->pass_adaptive_aux_buffer = PASS_UNUSED; + kfilm->pass_shadow_catcher = PASS_UNUSED; + kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED; + kfilm->pass_shadow_catcher_matte = PASS_UNUSED; bool have_cryptomatte = false; + bool have_aov_color = false; + bool have_aov_value = false; for (size_t i = 0; i < scene->passes.size(); i++) { - Pass &pass = scene->passes[i]; + const Pass *pass = scene->passes[i]; - if (pass.type == PASS_NONE) { + if (pass->get_type() == PASS_NONE || !pass->is_written()) { + continue; + } + + if (pass->get_mode() == PassMode::DENOISED) { + /* Generally we only storing offsets of the noisy passes. The display pass is an exception + * since it is a read operation and not a write. */ + kfilm->pass_stride += pass->get_info().num_components; continue; } /* Can't do motion pass if no motion vectors are available. */ - if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) { + if (pass->get_type() == PASS_MOTION || pass->get_type() == PASS_MOTION_WEIGHT) { if (scene->need_motion() != Scene::MOTION_PASS) { - kfilm->pass_stride += pass.components; + kfilm->pass_stride += pass->get_info().num_components; continue; } } - int pass_flag = (1 << (pass.type % 32)); - if (pass.type <= PASS_CATEGORY_MAIN_END) { - kfilm->pass_flag |= pass_flag; - } - else if (pass.type <= PASS_CATEGORY_LIGHT_END) { - kfilm->use_light_pass = 1; + const int pass_flag = (1 << (pass->get_type() % 32)); + if (pass->get_type() <= PASS_CATEGORY_LIGHT_END) { kfilm->light_pass_flag |= pass_flag; } + else if (pass->get_type() <= PASS_CATEGORY_DATA_END) { + kfilm->pass_flag |= pass_flag; + } else { - assert(pass.type <= PASS_CATEGORY_BAKE_END); + assert(pass->get_type() <= PASS_CATEGORY_BAKE_END); } - switch (pass.type) { + switch (pass->get_type()) { case PASS_COMBINED: kfilm->pass_combined = kfilm->pass_stride; break; @@ -479,6 
+247,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_NORMAL: kfilm->pass_normal = kfilm->pass_stride; break; + case PASS_POSITION: + kfilm->pass_position = kfilm->pass_stride; + break; + case PASS_ROUGHNESS: + kfilm->pass_roughness = kfilm->pass_stride; + break; case PASS_UV: kfilm->pass_uv = kfilm->pass_stride; break; @@ -511,9 +285,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_shadow = kfilm->pass_stride; break; - case PASS_LIGHT: - break; - case PASS_DIFFUSE_COLOR: kfilm->pass_diffuse_color = kfilm->pass_stride; break; @@ -563,78 +334,56 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride; have_cryptomatte = true; break; + + case PASS_DENOISING_NORMAL: + kfilm->pass_denoising_normal = kfilm->pass_stride; + break; + case PASS_DENOISING_ALBEDO: + kfilm->pass_denoising_albedo = kfilm->pass_stride; + break; + + case PASS_SHADOW_CATCHER: + kfilm->pass_shadow_catcher = kfilm->pass_stride; + break; + case PASS_SHADOW_CATCHER_SAMPLE_COUNT: + kfilm->pass_shadow_catcher_sample_count = kfilm->pass_stride; + break; + case PASS_SHADOW_CATCHER_MATTE: + kfilm->pass_shadow_catcher_matte = kfilm->pass_stride; + break; + case PASS_ADAPTIVE_AUX_BUFFER: kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride; break; case PASS_SAMPLE_COUNT: kfilm->pass_sample_count = kfilm->pass_stride; break; + case PASS_AOV_COLOR: - if (kfilm->pass_aov_color_num == 0) { + if (!have_aov_color) { kfilm->pass_aov_color = kfilm->pass_stride; + have_aov_color = true; } - kfilm->pass_aov_color_num++; break; case PASS_AOV_VALUE: - if (kfilm->pass_aov_value_num == 0) { + if (!have_aov_value) { kfilm->pass_aov_value = kfilm->pass_stride; + have_aov_value = true; } - kfilm->pass_aov_value_num++; break; default: assert(false); break; } - if (pass.type == display_pass) { - kfilm->display_pass_stride = kfilm->pass_stride; - kfilm->display_pass_components = pass.components; - 
kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f); - } - else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR || - pass.type == PASS_GLOSSY_COLOR) { - kfilm->display_divide_pass_stride = kfilm->pass_stride; - } - - kfilm->pass_stride += pass.components; - } - - kfilm->pass_denoising_data = 0; - kfilm->pass_denoising_clean = 0; - kfilm->denoising_flags = 0; - if (denoising_data_pass) { - kfilm->pass_denoising_data = kfilm->pass_stride; - kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; - kfilm->denoising_flags = denoising_flags; - if (denoising_clean_pass) { - kfilm->pass_denoising_clean = kfilm->pass_stride; - kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; - kfilm->use_light_pass = 1; - } - if (denoising_prefiltered_pass) { - kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED; - } - } - - kfilm->pass_stride = align_up(kfilm->pass_stride, 4); - - /* When displaying the normal/uv pass in the viewport we need to disable - * transparency. - * - * We also don't need to perform light accumulations. Later we want to optimize this to suppress - * light calculations. 
*/ - if (display_pass == PASS_NORMAL || display_pass == PASS_UV) { - kfilm->use_light_pass = 0; - } - else { - kfilm->pass_alpha_threshold = pass_alpha_threshold; + kfilm->pass_stride += pass->get_info().num_components; } /* update filter table */ vector<float> table = filter_table(filter_type, filter_width); - scene->lookup_tables->remove_table(&filter_table_offset); - filter_table_offset = scene->lookup_tables->add_table(dscene, table); - kfilm->filter_table_offset = (int)filter_table_offset; + scene->lookup_tables->remove_table(&filter_table_offset_); + filter_table_offset_ = scene->lookup_tables->add_table(dscene, table); + kfilm->filter_table_offset = (int)filter_table_offset_; /* mist pass parameters */ kfilm->mist_start = mist_start; @@ -644,79 +393,298 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->cryptomatte_passes = cryptomatte_passes; kfilm->cryptomatte_depth = cryptomatte_depth; - pass_stride = kfilm->pass_stride; - denoising_data_offset = kfilm->pass_denoising_data; - denoising_clean_offset = kfilm->pass_denoising_clean; - clear_modified(); } void Film::device_free(Device * /*device*/, DeviceScene * /*dscene*/, Scene *scene) { - scene->lookup_tables->remove_table(&filter_table_offset); + scene->lookup_tables->remove_table(&filter_table_offset_); } -void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes) +int Film::get_aov_offset(Scene *scene, string name, bool &is_color) { - if (Pass::contains(scene->passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) { - scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED); + int offset_color = 0, offset_value = 0; + foreach (const Pass *pass, scene->passes) { + if (pass->get_name() == name) { + if (pass->get_type() == PASS_AOV_VALUE) { + is_color = false; + return offset_value; + } + else if (pass->get_type() == PASS_AOV_COLOR) { + is_color = true; + return offset_color; + } + } + + if (pass->get_type() == 
PASS_AOV_VALUE) { + offset_value += pass->get_info().num_components; + } + else if (pass->get_type() == PASS_AOV_COLOR) { + offset_color += pass->get_info().num_components; + } + } + + return -1; +} + +void Film::update_passes(Scene *scene, bool add_sample_count_pass) +{ + const Background *background = scene->background; + const BakeManager *bake_manager = scene->bake_manager; + const ObjectManager *object_manager = scene->object_manager; + Integrator *integrator = scene->integrator; + + if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) { + return; + } + + /* Remove auto generated passes and recreate them. */ + remove_auto_passes(scene); + + /* Display pass for viewport. */ + const PassType display_pass = get_display_pass(); + add_auto_pass(scene, display_pass); + + /* Assumption is that a combined pass always exists for now, for example + * adaptive sampling is always based on a combined pass. But we should + * try to lift this limitation in the future for faster rendering of + * individual passes. */ + if (display_pass != PASS_COMBINED) { + add_auto_pass(scene, PASS_COMBINED); + } + + /* Create passes needed for adaptive sampling. */ + const AdaptiveSampling adaptive_sampling = integrator->get_adaptive_sampling(); + if (adaptive_sampling.use) { + add_auto_pass(scene, PASS_SAMPLE_COUNT); + add_auto_pass(scene, PASS_ADAPTIVE_AUX_BUFFER); + } + + /* Create passes needed for denoising. */ + const bool use_denoise = integrator->get_use_denoise(); + if (use_denoise) { + if (integrator->get_use_denoise_pass_normal()) { + add_auto_pass(scene, PASS_DENOISING_NORMAL); + } + if (integrator->get_use_denoise_pass_albedo()) { + add_auto_pass(scene, PASS_DENOISING_ALBEDO); + } + } + + /* Create passes for shadow catcher. 
*/ + if (scene->has_shadow_catcher()) { + const bool need_background = get_use_approximate_shadow_catcher() && + !background->get_transparent(); + + add_auto_pass(scene, PASS_SHADOW_CATCHER); + add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT); + add_auto_pass(scene, PASS_SHADOW_CATCHER_MATTE); + + if (need_background) { + add_auto_pass(scene, PASS_BACKGROUND); + } + } + else if (Pass::contains(scene->passes, PASS_SHADOW_CATCHER)) { + add_auto_pass(scene, PASS_SHADOW_CATCHER); + add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT); + } + + const vector<Pass *> passes_immutable = scene->passes; + for (const Pass *pass : passes_immutable) { + const PassInfo info = pass->get_info(); + /* Add utility passes needed to generate some light passes. */ + if (info.divide_type != PASS_NONE) { + add_auto_pass(scene, info.divide_type); + } + if (info.direct_type != PASS_NONE) { + add_auto_pass(scene, info.direct_type); + } + if (info.indirect_type != PASS_NONE) { + add_auto_pass(scene, info.indirect_type); + } + + /* NOTE: Enable all denoised passes when storage is requested. + * This way it is possible to tweak denoiser parameters later on. */ + if (info.support_denoise && use_denoise) { + add_auto_pass(scene, pass->get_type(), PassMode::DENOISED); + } + } + + if (bake_manager->get_baking()) { + add_auto_pass(scene, PASS_BAKE_PRIMITIVE, "BakePrimitive"); + add_auto_pass(scene, PASS_BAKE_DIFFERENTIAL, "BakeDifferential"); + } + + if (add_sample_count_pass) { + if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) { + add_auto_pass(scene, PASS_SAMPLE_COUNT); + } + } + + /* Remove duplicates and initialize internal pass info. */ + finalize_passes(scene, use_denoise); + /* Flush scene updates. 
*/ + const bool have_uv_pass = Pass::contains(scene->passes, PASS_UV); + const bool have_motion_pass = Pass::contains(scene->passes, PASS_MOTION); + const bool have_ao_pass = Pass::contains(scene->passes, PASS_AO); + + if (have_uv_pass != prev_have_uv_pass) { + scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED); foreach (Shader *shader, scene->shaders) shader->need_update_uvs = true; } - else if (Pass::contains(scene->passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) { + if (have_motion_pass != prev_have_motion_pass) { scene->geometry_manager->tag_update(scene, GeometryManager::MOTION_PASS_NEEDED); } - else if (Pass::contains(scene->passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) { + if (have_ao_pass != prev_have_ao_pass) { scene->integrator->tag_update(scene, Integrator::AO_PASS_MODIFIED); } - if (update_passes) { - scene->passes = passes_; + prev_have_uv_pass = have_uv_pass; + prev_have_motion_pass = have_motion_pass; + prev_have_ao_pass = have_ao_pass; + + tag_modified(); + + /* Debug logging. */ + if (VLOG_IS_ON(2)) { + VLOG(2) << "Effective scene passes:"; + for (const Pass *pass : scene->passes) { + VLOG(2) << "- " << *pass; + } } } -int Film::get_aov_offset(Scene *scene, string name, bool &is_color) +void Film::add_auto_pass(Scene *scene, PassType type, const char *name) { - int num_color = 0, num_value = 0; - foreach (const Pass &pass, scene->passes) { - if (pass.type == PASS_AOV_COLOR) { - num_color++; - } - else if (pass.type == PASS_AOV_VALUE) { - num_value++; + add_auto_pass(scene, type, PassMode::NOISY, name); +} + +void Film::add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name) +{ + Pass *pass = new Pass(); + pass->set_type(type); + pass->set_mode(mode); + pass->set_name(ustring((name) ? 
name : "")); + pass->is_auto_ = true; + + pass->set_owner(scene); + scene->passes.push_back(pass); +} + +void Film::remove_auto_passes(Scene *scene) +{ + /* Remove all passes which were automatically created. */ + vector<Pass *> new_passes; + + for (Pass *pass : scene->passes) { + if (!pass->is_auto_) { + new_passes.push_back(pass); } else { - continue; - } - - if (pass.name == name) { - is_color = (pass.type == PASS_AOV_COLOR); - return (is_color ? num_color : num_value) - 1; + delete pass; } } - return -1; + scene->passes = new_passes; } -int Film::get_pass_stride() const +static bool compare_pass_order(const Pass *a, const Pass *b) { - return pass_stride; -} + const int num_components_a = a->get_info().num_components; + const int num_components_b = b->get_info().num_components; -int Film::get_denoising_data_offset() const -{ - return denoising_data_offset; + if (num_components_a == num_components_b) { + return (a->get_type() < b->get_type()); + } + + return num_components_a > num_components_b; } -int Film::get_denoising_clean_offset() const +void Film::finalize_passes(Scene *scene, const bool use_denoise) { - return denoising_clean_offset; + /* Remove duplicate passes. */ + vector<Pass *> new_passes; + + for (Pass *pass : scene->passes) { + /* Disable denoising on passes if denoising is disabled, or if the + * pass does not support it. */ + pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() : + PassMode::NOISY); + + /* Merge duplicate passes. */ + bool duplicate_found = false; + for (Pass *new_pass : new_passes) { + /* If different type or denoising, don't merge. */ + if (new_pass->get_type() != pass->get_type() || new_pass->get_mode() != pass->get_mode()) { + continue; + } + + /* If both passes have a name and the names are different, don't merge. + * If either pass has a name, we'll use that name. 
*/ + if (!pass->get_name().empty() && !new_pass->get_name().empty() && + pass->get_name() != new_pass->get_name()) { + continue; + } + + if (!pass->get_name().empty() && new_pass->get_name().empty()) { + new_pass->set_name(pass->get_name()); + } + + new_pass->is_auto_ &= pass->is_auto_; + duplicate_found = true; + break; + } + + if (!duplicate_found) { + new_passes.push_back(pass); + } + else { + delete pass; + } + } + + /* Order from by components and type, This is required to for AOVs and cryptomatte passes, + * which the kernel assumes to be in order. Note this must use stable sort so cryptomatte + * passes remain in the right order. */ + stable_sort(new_passes.begin(), new_passes.end(), compare_pass_order); + + scene->passes = new_passes; } -size_t Film::get_filter_table_offset() const +uint Film::get_kernel_features(const Scene *scene) const { - return filter_table_offset; + uint kernel_features = 0; + + for (const Pass *pass : scene->passes) { + if (!pass->is_written()) { + continue; + } + + const PassType pass_type = pass->get_type(); + const PassMode pass_mode = pass->get_mode(); + + if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL || + pass_type == PASS_DENOISING_ALBEDO) { + kernel_features |= KERNEL_FEATURE_DENOISING; + } + + if (pass_type != PASS_NONE && pass_type != PASS_COMBINED && + pass_type <= PASS_CATEGORY_LIGHT_END) { + kernel_features |= KERNEL_FEATURE_LIGHT_PASSES; + + if (pass_type == PASS_SHADOW) { + kernel_features |= KERNEL_FEATURE_SHADOW_PASS; + } + } + + if (pass_type == PASS_AO) { + kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE; + } + } + + return kernel_features; } CCL_NAMESPACE_END diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 462a7275491..5d327353361 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -17,6 +17,7 @@ #ifndef __FILM_H__ #define __FILM_H__ +#include "render/pass.h" #include "util/util_string.h" #include "util/util_vector.h" @@ -38,36 
+39,15 @@ typedef enum FilterType { FILTER_NUM_TYPES, } FilterType; -class Pass : public Node { - public: - NODE_DECLARE - - Pass(); - - PassType type; - int components; - bool filter; - bool exposure; - PassType divide_type; - ustring name; - - static void add(PassType type, vector<Pass> &passes, const char *name = NULL); - static bool equals(const vector<Pass> &A, const vector<Pass> &B); - static bool contains(const vector<Pass> &passes, PassType); -}; - class Film : public Node { public: NODE_DECLARE NODE_SOCKET_API(float, exposure) - NODE_SOCKET_API(bool, denoising_data_pass) - NODE_SOCKET_API(bool, denoising_clean_pass) - NODE_SOCKET_API(bool, denoising_prefiltered_pass) - NODE_SOCKET_API(int, denoising_flags) NODE_SOCKET_API(float, pass_alpha_threshold) NODE_SOCKET_API(PassType, display_pass) + NODE_SOCKET_API(bool, show_active_pixels) NODE_SOCKET_API(FilterType, filter_type) NODE_SOCKET_API(float, filter_width) @@ -76,17 +56,18 @@ class Film : public Node { NODE_SOCKET_API(float, mist_depth) NODE_SOCKET_API(float, mist_falloff) - NODE_SOCKET_API(bool, use_light_visibility) NODE_SOCKET_API(CryptomatteType, cryptomatte_passes) NODE_SOCKET_API(int, cryptomatte_depth) - NODE_SOCKET_API(bool, use_adaptive_sampling) + /* Approximate shadow catcher pass into its matte pass, so that both artificial objects and + * shadows can be alpha-overed onto a backdrop. 
*/ + NODE_SOCKET_API(bool, use_approximate_shadow_catcher) private: - int pass_stride; - int denoising_data_offset; - int denoising_clean_offset; - size_t filter_table_offset; + size_t filter_table_offset_; + bool prev_have_uv_pass = false; + bool prev_have_motion_pass = false; + bool prev_have_ao_pass = false; public: Film(); @@ -98,14 +79,20 @@ class Film : public Node { void device_update(Device *device, DeviceScene *dscene, Scene *scene); void device_free(Device *device, DeviceScene *dscene, Scene *scene); - void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true); - int get_aov_offset(Scene *scene, string name, bool &is_color); - int get_pass_stride() const; - int get_denoising_data_offset() const; - int get_denoising_clean_offset() const; - size_t get_filter_table_offset() const; + /* Update passes so that they contain all passes required for the configured functionality. + * + * If `add_sample_count_pass` is true then the SAMPLE_COUNT pass is ensured to be added. */ + void update_passes(Scene *scene, bool add_sample_count_pass); + + uint get_kernel_features(const Scene *scene) const; + + private: + void add_auto_pass(Scene *scene, PassType type, const char *name = nullptr); + void add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name = nullptr); + void remove_auto_passes(Scene *scene); + void finalize_passes(Scene *scene, const bool use_denoise); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp index 7ec1d2d9abb..6804a006fe6 100644 --- a/intern/cycles/render/geometry.cpp +++ b/intern/cycles/render/geometry.cpp @@ -215,6 +215,12 @@ void Geometry::compute_bvh( msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total); Object object; + + /* Ensure all visibility bits are set at the geometry level BVH. In + * the object level BVH is where actual visibility is tested. 
*/ + object.set_is_shadow_catcher(true); + object.set_visibility(~0); + object.set_geometry(this); vector<Geometry *> geometry; @@ -315,7 +321,7 @@ void GeometryManager::update_osl_attributes(Device *device, { #ifdef WITH_OSL /* for OSL, a hash map is used to lookup the attribute by name. */ - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); og->object_name_map.clear(); og->attribute_map.clear(); @@ -1855,8 +1861,8 @@ void GeometryManager::device_update(Device *device, }); Camera *dicing_camera = scene->dicing_camera; - dicing_camera->set_screen_size_and_resolution( - dicing_camera->get_full_width(), dicing_camera->get_full_height(), 1); + dicing_camera->set_screen_size(dicing_camera->get_full_width(), + dicing_camera->get_full_height()); dicing_camera->update(scene); size_t i = 0; @@ -2157,7 +2163,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE; #ifdef WITH_OSL - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); if (og) { og->object_name_map.clear(); diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/render/gpu_display.cpp new file mode 100644 index 00000000000..a8f0cc50583 --- /dev/null +++ b/intern/cycles/render/gpu_display.cpp @@ -0,0 +1,227 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "render/gpu_display.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +void GPUDisplay::reset(const BufferParams &buffer_params) +{ + thread_scoped_lock lock(mutex_); + + const GPUDisplayParams old_params = params_; + + params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y); + params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height); + params_.size = make_int2(buffer_params.width, buffer_params.height); + + /* If the parameters did change tag texture as unusable. This avoids drawing old texture content + * in an updated configuration of the viewport. For example, avoids drawing old frame when render + * border did change. + * If the parameters did not change, allow drawing the current state of the texture, which will + * not count as an up-to-date redraw. This will avoid flickering when doping camera navigation by + * showing a previously rendered frame for until the new one is ready. */ + if (old_params.modified(params_)) { + texture_state_.is_usable = false; + } + + texture_state_.is_outdated = true; +} + +void GPUDisplay::mark_texture_updated() +{ + texture_state_.is_outdated = false; + texture_state_.is_usable = true; +} + +/* -------------------------------------------------------------------- + * Update procedure. + */ + +bool GPUDisplay::update_begin(int texture_width, int texture_height) +{ + DCHECK(!update_state_.is_active); + + if (update_state_.is_active) { + LOG(ERROR) << "Attempt to re-activate update process."; + return false; + } + + /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. + * The update itself is non-blocking however, for better performance and to avoid + * potential deadlocks due to locks held by the subclass. 
*/ + GPUDisplayParams params; + { + thread_scoped_lock lock(mutex_); + params = params_; + texture_state_.size = make_int2(texture_width, texture_height); + } + + if (!do_update_begin(params, texture_width, texture_height)) { + LOG(ERROR) << "GPUDisplay implementation could not begin update."; + return false; + } + + update_state_.is_active = true; + + return true; +} + +void GPUDisplay::update_end() +{ + DCHECK(update_state_.is_active); + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to deactivate inactive update process."; + return; + } + + do_update_end(); + + update_state_.is_active = false; +} + +int2 GPUDisplay::get_texture_size() const +{ + return texture_state_.size; +} + +/* -------------------------------------------------------------------- + * Texture update from CPU buffer. + */ + +void GPUDisplay::copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) +{ + DCHECK(update_state_.is_active); + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + return; + } + + mark_texture_updated(); + do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height); +} + +/* -------------------------------------------------------------------- + * Texture buffer mapping. 
+ */ + +half4 *GPUDisplay::map_texture_buffer() +{ + DCHECK(!texture_buffer_state_.is_mapped); + DCHECK(update_state_.is_active); + + if (texture_buffer_state_.is_mapped) { + LOG(ERROR) << "Attempt to re-map an already mapped texture buffer."; + return nullptr; + } + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + return nullptr; + } + + half4 *mapped_rgba_pixels = do_map_texture_buffer(); + + if (mapped_rgba_pixels) { + texture_buffer_state_.is_mapped = true; + } + + return mapped_rgba_pixels; +} + +void GPUDisplay::unmap_texture_buffer() +{ + DCHECK(texture_buffer_state_.is_mapped); + + if (!texture_buffer_state_.is_mapped) { + LOG(ERROR) << "Attempt to unmap non-mapped texture buffer."; + return; + } + + texture_buffer_state_.is_mapped = false; + + mark_texture_updated(); + do_unmap_texture_buffer(); +} + +/* -------------------------------------------------------------------- + * Graphics interoperability. + */ + +DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get() +{ + DCHECK(!texture_buffer_state_.is_mapped); + DCHECK(update_state_.is_active); + + if (texture_buffer_state_.is_mapped) { + LOG(ERROR) + << "Attempt to use graphics interoperability mode while the texture buffer is mapped."; + return DeviceGraphicsInteropDestination(); + } + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update."; + return DeviceGraphicsInteropDestination(); + } + + /* Assume that interop will write new values to the texture. */ + mark_texture_updated(); + + return do_graphics_interop_get(); +} + +void GPUDisplay::graphics_interop_activate() +{ +} + +void GPUDisplay::graphics_interop_deactivate() +{ +} + +/* -------------------------------------------------------------------- + * Drawing. + */ + +bool GPUDisplay::draw() +{ + /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. 
+ * The drawing itself is non-blocking however, for better performance and to avoid + * potential deadlocks due to locks held by the subclass. */ + GPUDisplayParams params; + bool is_usable; + bool is_outdated; + + { + thread_scoped_lock lock(mutex_); + params = params_; + is_usable = texture_state_.is_usable; + is_outdated = texture_state_.is_outdated; + } + + if (is_usable) { + do_draw(params); + } + + return !is_outdated; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/render/gpu_display.h new file mode 100644 index 00000000000..cbe347895a1 --- /dev/null +++ b/intern/cycles/render/gpu_display.h @@ -0,0 +1,247 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_graphics_interop.h" +#include "util/util_half.h" +#include "util/util_thread.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; + +/* GPUDisplay class takes care of drawing render result in a viewport. The render result is stored + * in a GPU-side texture, which is updated from a path tracer and drawn by an application. + * + * The base GPUDisplay does some special texture state tracking, which allows render Session to + * make decisions on whether reset for an updated state is possible or not. This state should only + * be tracked in a base class and a particular implementation should not worry about it. 
+ * + * The subclasses should only implement the pure virtual methods, which allows them to not worry + * about parent method calls, which helps them to be as small and reliable as possible. */ + +class GPUDisplayParams { + public: + /* Offset of the display within a viewport. + * For example, set to a lower-bottom corner of border render in Blender's viewport. */ + int2 offset = make_int2(0, 0); + + /* Full viewport size. + * + * NOTE: Is not affected by the resolution divider. */ + int2 full_size = make_int2(0, 0); + + /* Effective vieport size. + * In the case of border render, size of the border rectangle. + * + * NOTE: Is not affected by the resolution divider. */ + int2 size = make_int2(0, 0); + + bool modified(const GPUDisplayParams &other) const + { + return !(offset == other.offset && full_size == other.full_size && size == other.size); + } +}; + +class GPUDisplay { + public: + GPUDisplay() = default; + virtual ~GPUDisplay() = default; + + /* Reset the display for the new state of render session. Is called whenever session is reset, + * which happens on changes like viewport navigation or viewport dimension change. + * + * This call will configure parameters for a changed buffer and reset the texture state. */ + void reset(const BufferParams &buffer_params); + + const GPUDisplayParams &get_params() const + { + return params_; + } + + /* -------------------------------------------------------------------- + * Update procedure. + * + * These calls indicates a desire of the caller to update content of the displayed texture. */ + + /* Returns true when update is ready. Update should be finished with update_end(). + * + * If false is returned then no update is possible, and no update_end() call is needed. + * + * The texture width and height denotes an actual resolution of the underlying render result. 
*/ + bool update_begin(int texture_width, int texture_height); + + void update_end(); + + /* Get currently configured texture size of the display (as configured by `update_begin()`. */ + int2 get_texture_size() const; + + /* -------------------------------------------------------------------- + * Texture update from CPU buffer. + * + * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. + * + * Most portable implementation, which must be supported by all platforms. Might not be the most + * efficient one. + */ + + /* Copy buffer of rendered pixels of a given size into a given position of the texture. + * + * This function does not acquire a lock. The reason for this is is to allow use of this function + * for partial updates from different devices. In this case the caller will acquire the lock + * once, update all the slices and release + * the lock once. This will ensure that draw() will never use partially updated texture. */ + void copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height); + + /* -------------------------------------------------------------------- + * Texture buffer mapping. + * + * This functionality is used to update GPU-side texture content without need to maintain CPU + * side buffer on the caller. + * + * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. + * + * NOTE: Texture buffer can not be mapped while graphics interopeability is active. This means + * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and + * `graphics_interop_end()` calls. + */ + + /* Map pixels memory form texture to a buffer available for write from CPU. Width and height will + * define a requested size of the texture to write to. + * Upon success a non-null pointer is returned and the texture buffer is to be unmapped. 
+ * If an error happens during mapping, or if mapoping is not supported by this GPU display a + * null pointer is returned and the buffer is NOT to be unmapped. + * + * NOTE: Usually the implementation will rely on a GPU context of some sort, and the GPU context + * is often can not be bound to two threads simultaneously, and can not be released from a + * different thread. This means that the mapping API should be used from the single thread only, + */ + half4 *map_texture_buffer(); + void unmap_texture_buffer(); + + /* -------------------------------------------------------------------- + * Graphics interoperability. + * + * A special code path which allows to update texture content directly from the GPU compute + * device. Complementary part of DeviceGraphicsInterop. + * + * NOTE: Graphics interoperability can not be used while the texture buffer is mapped. This means + * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and + * `unmap_texture_buffer()` calls. */ + + /* Get GPUDisplay graphics interoperability information which acts as a destination for the + * device API. */ + DeviceGraphicsInteropDestination graphics_interop_get(); + + /* (De)activate GPU display for graphics interoperability outside of regular display udpate + * routines. */ + virtual void graphics_interop_activate(); + virtual void graphics_interop_deactivate(); + + /* -------------------------------------------------------------------- + * Drawing. + */ + + /* Clear the texture by filling it with all zeroes. + * + * This call might happen in parallel with draw, but can never happen in parallel with the + * update. + * + * The actual zero-ing can be deferred to a later moment. What is important is that after clear + * and before pixels update the drawing texture will be fully empty, and that partial update + * after clear will write new pixel values for an updating area, leaving everything else zeroed. 
+ * + * If the GPU display supports graphics interoperability then the zeroing the display is to be + * delegated to the device via the `DeviceGraphicsInteropDestination`. */ + virtual void clear() = 0; + + /* Draw the current state of the texture. + * + * Returns true if this call did draw an updated state of the texture. */ + bool draw(); + + protected: + /* Implementation-specific calls which subclasses are to implement. + * These `do_foo()` method corresponds to their `foo()` calls, but they are purely virtual to + * simplify their particular implementation. */ + virtual bool do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) = 0; + virtual void do_update_end() = 0; + + virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, + int texture_x, + int texture_y, + int pixels_width, + int pixels_height) = 0; + + virtual half4 *do_map_texture_buffer() = 0; + virtual void do_unmap_texture_buffer() = 0; + + /* Note that this might be called in parallel to do_update_begin() and do_update_end(), + * the subclass is responsible for appropriate mutex locks to avoid multiple threads + * editing and drawing the texture at the same time. */ + virtual void do_draw(const GPUDisplayParams ¶ms) = 0; + + virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0; + + private: + thread_mutex mutex_; + GPUDisplayParams params_; + + /* Mark texture as its content has been updated. + * Used from places which knows that the texture content has been brough up-to-date, so that the + * drawing knows whether it can be performed, and whether drawing happenned with an up-to-date + * texture state. */ + void mark_texture_updated(); + + /* State of the update process. */ + struct { + /* True when update is in process, indicated by `update_begin()` / `update_end()`. */ + bool is_active = false; + } update_state_; + + /* State of the texture, which is needed for an integration with render session and interactive + * updates and navigation. 
*/ + struct { + /* Denotes whether possibly existing state of GPU side texture is still usable. + * It will not be usable in cases like render border did change (in this case we don't want + * previous texture to be rendered at all). + * + * However, if only navigation or object in scene did change, then the outdated state of the + * texture is still usable for draw, preventing display viewport flickering on navigation and + * object modifications. */ + bool is_usable = false; + + /* Texture is considered outdated after `reset()` until the next call of + * `copy_pixels_to_texture()`. */ + bool is_outdated = true; + + /* Texture size in pixels. */ + int2 size = make_int2(0, 0); + } texture_state_; + + /* State of the texture buffer. Is tracked to perform sanity checks. */ + struct { + /* True when the texture buffer is mapped with `map_texture_buffer()`. */ + bool is_mapped = false; + } texture_buffer_state_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 5102b182593..3584754fad1 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -224,10 +224,6 @@ class ShaderNode : public Node { { return false; } - virtual bool has_raytrace() - { - return false; - } vector<ShaderInput *> inputs; vector<ShaderOutput *> outputs; @@ -242,22 +238,13 @@ class ShaderNode : public Node { * that those functions are for selective compilation only? */ - /* Nodes are split into several groups, group of level 0 contains - * nodes which are most commonly used, further levels are extension - * of previous one and includes less commonly used nodes. - */ - virtual int get_group() - { - return NODE_GROUP_LEVEL_0; - } - /* Node feature are used to disable huge nodes inside the group, * so it's possible to disable huge nodes inside of the required * nodes group. */ virtual int get_feature() { - return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP; + return bump == SHADER_BUMP_NONE ? 
0 : KERNEL_FEATURE_NODE_BUMP; } /* Get closure ID to which the node compiles into. */ diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index d8749cec9fa..d74d14242bb 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -53,6 +53,8 @@ NODE_DEFINE(Integrator) SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); SOCKET_INT(ao_bounces, "AO Bounces", 0); + SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f); + SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX); SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024); SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f); @@ -66,33 +68,39 @@ NODE_DEFINE(Integrator) SOCKET_BOOLEAN(motion_blur, "Motion Blur", false); SOCKET_INT(aa_samples, "AA Samples", 0); - SOCKET_INT(diffuse_samples, "Diffuse Samples", 1); - SOCKET_INT(glossy_samples, "Glossy Samples", 1); - SOCKET_INT(transmission_samples, "Transmission Samples", 1); - SOCKET_INT(ao_samples, "AO Samples", 1); - SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1); - SOCKET_INT(subsurface_samples, "Subsurface Samples", 1); - SOCKET_INT(volume_samples, "Volume Samples", 1); SOCKET_INT(start_sample, "Start Sample", 0); + SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false); SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f); SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0); - SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true); - SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true); SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f); - static NodeEnum method_enum; - method_enum.insert("path", PATH); - method_enum.insert("branched_path", BRANCHED_PATH); - SOCKET_ENUM(method, "Method", method_enum, PATH); - static NodeEnum sampling_pattern_enum; sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL); - sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ); 
sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ); SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL); + static NodeEnum denoiser_type_enum; + denoiser_type_enum.insert("optix", DENOISER_OPTIX); + denoiser_type_enum.insert("openimagedenoise", DENOISER_OPENIMAGEDENOISE); + + static NodeEnum denoiser_prefilter_enum; + denoiser_prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + denoiser_prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + denoiser_prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + + /* Default to accurate denoising with OpenImageDenoise. For interactive viewport + * it's best use OptiX and disable the normal pass since it does not always have + * the desired effect for that denoiser. */ + SOCKET_BOOLEAN(use_denoise, "Use Denoiser", false); + SOCKET_ENUM(denoiser_type, "Denoiser Type", denoiser_type_enum, DENOISER_OPENIMAGEDENOISE); + SOCKET_INT(denoise_start_sample, "Start Sample to Denoise", 0); + SOCKET_BOOLEAN(use_denoise_pass_albedo, "Use Albedo Pass for Denoiser", true); + SOCKET_BOOLEAN(use_denoise_pass_normal, "Use Normal Pass for Denoiser", true); + SOCKET_ENUM( + denoiser_prefilter, "Denoiser Type", denoiser_prefilter_enum, DENOISER_PREFILTER_ACCURATE); + return type; } @@ -115,13 +123,20 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene } }); - const bool need_update_lut = ao_samples_is_modified() || diffuse_samples_is_modified() || - glossy_samples_is_modified() || max_bounce_is_modified() || - max_transmission_bounce_is_modified() || - mesh_light_samples_is_modified() || method_is_modified() || - sampling_pattern_is_modified() || - subsurface_samples_is_modified() || - transmission_samples_is_modified() || volume_samples_is_modified(); + KernelIntegrator *kintegrator = &dscene->data.integrator; + + /* Adaptive sampling requires PMJ samples. 
+ * + * This also makes detection of sampling pattern a bit more involved: can not rely on the changed + * state of socket, since its value might be different from the effective value used here. So + * instead compare with previous value in the KernelIntegrator. Only do it if the device was + * updated once (in which case the `sample_pattern_lut` will be allocated to a non-zero size). */ + const SamplingPattern new_sampling_pattern = (use_adaptive_sampling) ? SAMPLING_PATTERN_PMJ : + sampling_pattern; + + const bool need_update_lut = max_bounce_is_modified() || max_transmission_bounce_is_modified() || + dscene->sample_pattern_lut.size() == 0 || + kintegrator->sampling_pattern != new_sampling_pattern; if (need_update_lut) { dscene->sample_pattern_lut.tag_realloc(); @@ -129,8 +144,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene device_free(device, dscene); - KernelIntegrator *kintegrator = &dscene->data.integrator; - /* integrator parameters */ kintegrator->min_bounce = min_bounce + 1; kintegrator->max_bounce = max_bounce + 1; @@ -143,12 +156,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->transparent_min_bounce = transparent_min_bounce + 1; kintegrator->transparent_max_bounce = transparent_max_bounce + 1; - if (ao_bounces == 0) { - kintegrator->ao_bounces = INT_MAX; - } - else { - kintegrator->ao_bounces = ao_bounces - 1; - } + kintegrator->ao_bounces = ao_bounces; + kintegrator->ao_bounces_distance = ao_distance; + kintegrator->ao_bounces_factor = ao_factor; /* Transparent Shadows * We only need to enable transparent shadows, if we actually have @@ -171,10 +181,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->caustics_refractive = caustics_refractive; kintegrator->filter_glossy = (filter_glossy == 0.0f) ? 
FLT_MAX : 1.0f / filter_glossy; - kintegrator->seed = hash_uint2(seed, 0); - - kintegrator->use_ambient_occlusion = ((Pass::contains(scene->passes, PASS_AO)) || - dscene->data.background.ao_factor != 0.0f); + kintegrator->seed = seed; kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX : sample_clamp_direct * 3.0f; @@ -182,51 +189,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene FLT_MAX : sample_clamp_indirect * 3.0f; - kintegrator->branched = (method == BRANCHED_PATH) && device->info.has_branched_path; - kintegrator->volume_decoupled = device->info.has_volume_decoupled; - kintegrator->diffuse_samples = diffuse_samples; - kintegrator->glossy_samples = glossy_samples; - kintegrator->transmission_samples = transmission_samples; - kintegrator->ao_samples = ao_samples; - kintegrator->mesh_light_samples = mesh_light_samples; - kintegrator->subsurface_samples = subsurface_samples; - kintegrator->volume_samples = volume_samples; - kintegrator->start_sample = start_sample; - - if (kintegrator->branched) { - kintegrator->sample_all_lights_direct = sample_all_lights_direct; - kintegrator->sample_all_lights_indirect = sample_all_lights_indirect; - } - else { - kintegrator->sample_all_lights_direct = false; - kintegrator->sample_all_lights_indirect = false; - } - - kintegrator->sampling_pattern = sampling_pattern; - kintegrator->aa_samples = aa_samples; - if (aa_samples > 0 && adaptive_min_samples == 0) { - kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples)); - VLOG(1) << "Cycles adaptive sampling: automatic min samples = " - << kintegrator->adaptive_min_samples; - } - else { - kintegrator->adaptive_min_samples = max(4, adaptive_min_samples); - } - - kintegrator->adaptive_step = 4; - kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample; - - /* Adaptive step must be a power of two for bitwise operations to work. 
*/ - assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0); - - if (aa_samples > 0 && adaptive_threshold == 0.0f) { - kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples); - VLOG(1) << "Cycles adaptive sampling: automatic threshold = " - << kintegrator->adaptive_threshold; - } - else { - kintegrator->adaptive_threshold = adaptive_threshold; - } + kintegrator->sampling_pattern = new_sampling_pattern; if (light_sampling_threshold > 0.0f) { kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold; @@ -236,29 +199,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene } /* sobol directions table */ - int max_samples = 1; - - if (kintegrator->branched) { - foreach (Light *light, scene->lights) - max_samples = max(max_samples, light->get_samples()); - - max_samples = max(max_samples, - max(diffuse_samples, max(glossy_samples, transmission_samples))); - max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples))); - max_samples = max(max_samples, volume_samples); - } - - uint total_bounces = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX + - max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES); - - max_samples *= total_bounces; + int max_samples = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX + + max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES); int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM; dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS); if (need_update_lut) { - if (sampling_pattern == SAMPLING_PATTERN_SOBOL) { - uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions); + if (kintegrator->sampling_pattern == SAMPLING_PATTERN_SOBOL) { + uint *directions = (uint *)dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions); sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); @@ -276,10 +225,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene 
function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j)); } pool.wait_work(); + dscene->sample_pattern_lut.copy_to_device(); } } + kintegrator->has_shadow_catcher = scene->has_shadow_catcher(); + dscene->sample_pattern_lut.clear_modified(); clear_modified(); } @@ -295,17 +247,12 @@ void Integrator::tag_update(Scene *scene, uint32_t flag) tag_modified(); } - if (flag & (AO_PASS_MODIFIED | BACKGROUND_AO_MODIFIED)) { + if (flag & AO_PASS_MODIFIED) { /* tag only the ao_bounces socket as modified so we avoid updating sample_pattern_lut * unnecessarily */ tag_ao_bounces_modified(); } - if ((flag & LIGHT_SAMPLES_MODIFIED) && (method == BRANCHED_PATH)) { - /* the number of light samples may affect the size of the sample_pattern_lut */ - tag_sampling_pattern_modified(); - } - if (filter_glossy_is_modified()) { foreach (Shader *shader, scene->shaders) { if (shader->has_integrator_dependency) { @@ -321,4 +268,65 @@ void Integrator::tag_update(Scene *scene, uint32_t flag) } } +AdaptiveSampling Integrator::get_adaptive_sampling() const +{ + AdaptiveSampling adaptive_sampling; + + adaptive_sampling.use = use_adaptive_sampling; + + if (!adaptive_sampling.use) { + return adaptive_sampling; + } + + if (aa_samples > 0 && adaptive_threshold == 0.0f) { + adaptive_sampling.threshold = max(0.001f, 1.0f / (float)aa_samples); + VLOG(1) << "Cycles adaptive sampling: automatic threshold = " << adaptive_sampling.threshold; + } + else { + adaptive_sampling.threshold = adaptive_threshold; + } + + if (adaptive_sampling.threshold > 0 && adaptive_min_samples == 0) { + /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128. + * This is highly scene dependent, we make a guess that seemed to work well + * in various test scenes. 
*/ + const int min_samples = (int)ceilf(16.0f / powf(adaptive_sampling.threshold, 0.3f)); + adaptive_sampling.min_samples = max(4, min_samples); + VLOG(1) << "Cycles adaptive sampling: automatic min samples = " + << adaptive_sampling.min_samples; + } + else { + adaptive_sampling.min_samples = max(4, adaptive_min_samples); + } + + /* Arbitrary factor that makes the threshold more similar to what is was before, + * and gives arguably more intuitive values. */ + adaptive_sampling.threshold *= 5.0f; + + adaptive_sampling.adaptive_step = 16; + + DCHECK(is_power_of_two(adaptive_sampling.adaptive_step)) + << "Adaptive step must be a power of two for bitwise operations to work"; + + return adaptive_sampling; +} + +DenoiseParams Integrator::get_denoise_params() const +{ + DenoiseParams denoise_params; + + denoise_params.use = use_denoise; + + denoise_params.type = denoiser_type; + + denoise_params.start_sample = denoise_start_sample; + + denoise_params.use_pass_albedo = use_denoise_pass_albedo; + denoise_params.use_pass_normal = use_denoise_pass_normal; + + denoise_params.prefilter = denoiser_prefilter; + + return denoise_params; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 4eeeda92d41..32e108d62ca 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -19,7 +19,9 @@ #include "kernel/kernel_types.h" +#include "device/device_denoise.h" /* For the paramaters and type enum. 
*/ #include "graph/node.h" +#include "integrator/adaptive_sampling.h" CCL_NAMESPACE_BEGIN @@ -43,6 +45,8 @@ class Integrator : public Node { NODE_SOCKET_API(int, transparent_max_bounce) NODE_SOCKET_API(int, ao_bounces) + NODE_SOCKET_API(float, ao_factor) + NODE_SOCKET_API(float, ao_distance) NODE_SOCKET_API(int, volume_max_steps) NODE_SOCKET_API(float, volume_step_rate) @@ -62,37 +66,26 @@ class Integrator : public Node { static const int MAX_SAMPLES = (1 << 24); NODE_SOCKET_API(int, aa_samples) - NODE_SOCKET_API(int, diffuse_samples) - NODE_SOCKET_API(int, glossy_samples) - NODE_SOCKET_API(int, transmission_samples) - NODE_SOCKET_API(int, ao_samples) - NODE_SOCKET_API(int, mesh_light_samples) - NODE_SOCKET_API(int, subsurface_samples) - NODE_SOCKET_API(int, volume_samples) NODE_SOCKET_API(int, start_sample) - NODE_SOCKET_API(bool, sample_all_lights_direct) - NODE_SOCKET_API(bool, sample_all_lights_indirect) NODE_SOCKET_API(float, light_sampling_threshold) + NODE_SOCKET_API(bool, use_adaptive_sampling) NODE_SOCKET_API(int, adaptive_min_samples) NODE_SOCKET_API(float, adaptive_threshold) - enum Method { - BRANCHED_PATH = 0, - PATH = 1, - - NUM_METHODS, - }; - - NODE_SOCKET_API(Method, method) - NODE_SOCKET_API(SamplingPattern, sampling_pattern) + NODE_SOCKET_API(bool, use_denoise); + NODE_SOCKET_API(DenoiserType, denoiser_type); + NODE_SOCKET_API(int, denoise_start_sample); + NODE_SOCKET_API(bool, use_denoise_pass_albedo); + NODE_SOCKET_API(bool, use_denoise_pass_normal); + NODE_SOCKET_API(DenoiserPrefilter, denoiser_prefilter); + enum : uint32_t { AO_PASS_MODIFIED = (1 << 0), - BACKGROUND_AO_MODIFIED = (1 << 1), - LIGHT_SAMPLES_MODIFIED = (1 << 2), + OBJECT_MANAGER = (1 << 1), /* tag everything in the manager for an update */ UPDATE_ALL = ~0u, @@ -107,6 +100,9 @@ class Integrator : public Node { void device_free(Device *device, DeviceScene *dscene, bool force_free = false); void tag_update(Scene *scene, uint32_t flag); + + AdaptiveSampling get_adaptive_sampling() 
const; + DenoiseParams get_denoise_params() const; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp index fc47b0e8f0a..e31f8abd446 100644 --- a/intern/cycles/render/jitter.cpp +++ b/intern/cycles/render/jitter.cpp @@ -242,12 +242,6 @@ class PMJ02_Generator : public PMJ_Generator { static void shuffle(float2 points[], int size, int rng_seed) { - /* Offset samples by 1.0 for faster scrambling in kernel_random.h */ - for (int i = 0; i < size; ++i) { - points[i].x += 1.0f; - points[i].y += 1.0f; - } - if (rng_seed == 0) { return; } diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 15aa4e047b5..ae1150fc07b 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -14,12 +14,13 @@ * limitations under the License. */ -#include "render/light.h" #include "device/device.h" + #include "render/background.h" #include "render/film.h" #include "render/graph.h" #include "render/integrator.h" +#include "render/light.h" #include "render/mesh.h" #include "render/nodes.h" #include "render/object.h" @@ -27,6 +28,8 @@ #include "render/shader.h" #include "render/stats.h" +#include "integrator/shader_eval.h" + #include "util/util_foreach.h" #include "util/util_hash.h" #include "util/util_logging.h" @@ -43,63 +46,49 @@ static void shade_background_pixels(Device *device, vector<float3> &pixels, Progress &progress) { - /* create input */ - device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY); - device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE); - - uint4 *d_input_data = d_input.alloc(width * height); - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - float u = (x + 0.5f) / width; - float v = (y + 0.5f) / height; - - uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0); - d_input_data[x + y * width] = in; - } - } - - /* compute on device */ - d_output.alloc(width * height); - d_output.zero_to_device(); 
- d_input.copy_to_device(); - + /* Needs to be up to data for attribute access. */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - DeviceTask main_task(DeviceTask::SHADER); - main_task.shader_input = d_input.device_pointer; - main_task.shader_output = d_output.device_pointer; - main_task.shader_eval_type = SHADER_EVAL_BACKGROUND; - main_task.shader_x = 0; - main_task.shader_w = width * height; - main_task.num_samples = 1; - main_task.get_cancel = function_bind(&Progress::get_cancel, &progress); - - /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */ - list<DeviceTask> split_tasks; - main_task.split(split_tasks, 1, 128 * 128); - - foreach (DeviceTask &task, split_tasks) { - device->task_add(task); - device->task_wait(); - d_output.copy_from_device(task.shader_x, 1, task.shader_w); - } - - d_input.free(); - - float4 *d_output_data = d_output.data(); - - pixels.resize(width * height); - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - pixels[y * width + x].x = d_output_data[y * width + x].x; - pixels[y * width + x].y = d_output_data[y * width + x].y; - pixels[y * width + x].z = d_output_data[y * width + x].z; - } - } + const int size = width * height; + pixels.resize(size); + + /* Evaluate shader on device. */ + ShaderEval shader_eval(device, progress); + shader_eval.eval( + SHADER_EVAL_BACKGROUND, + size, + [&](device_vector<KernelShaderEvalInput> &d_input) { + /* Fill coordinates for shading. */ + KernelShaderEvalInput *d_input_data = d_input.data(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + float u = (x + 0.5f) / width; + float v = (y + 0.5f) / height; + + KernelShaderEvalInput in; + in.object = OBJECT_NONE; + in.prim = PRIM_NONE; + in.u = u; + in.v = v; + d_input_data[x + y * width] = in; + } + } - d_output.free(); + return size; + }, + [&](device_vector<float4> &d_output) { + /* Copy output to pixel buffer. 
*/ + float4 *d_output_data = d_output.data(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + pixels[y * width + x].x = d_output_data[y * width + x].x; + pixels[y * width + x].y = d_output_data[y * width + x].y; + pixels[y * width + x].z = d_output_data[y * width + x].z; + } + } + }); } /* Light */ @@ -140,15 +129,16 @@ NODE_DEFINE(Light) SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true); SOCKET_BOOLEAN(use_mis, "Use Mis", false); + SOCKET_BOOLEAN(use_camera, "Use Camera", true); SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true); SOCKET_BOOLEAN(use_glossy, "Use Glossy", true); SOCKET_BOOLEAN(use_transmission, "Use Transmission", true); SOCKET_BOOLEAN(use_scatter, "Use Scatter", true); - SOCKET_INT(samples, "Samples", 1); SOCKET_INT(max_bounces, "Max Bounces", 1024); SOCKET_UINT(random_id, "Random ID", 0); + SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", true); SOCKET_BOOLEAN(is_portal, "Is Portal", false); SOCKET_BOOLEAN(is_enabled, "Is Enabled", true); @@ -166,10 +156,6 @@ void Light::tag_update(Scene *scene) { if (is_modified()) { scene->light_manager->tag_update(scene, LightManager::LIGHT_MODIFIED); - - if (samples_is_modified()) { - scene->integrator->tag_update(scene, Integrator::LIGHT_SAMPLES_MODIFIED); - } } } @@ -193,7 +179,6 @@ LightManager::LightManager() { update_flags = UPDATE_ALL; need_update_background = true; - use_light_visibility = false; last_background_enabled = false; last_background_resolution = 0; } @@ -357,21 +342,23 @@ void LightManager::device_update_distribution(Device *, int object_id = j; int shader_flag = 0; + if (!(object->get_visibility() & PATH_RAY_CAMERA)) { + shader_flag |= SHADER_EXCLUDE_CAMERA; + } if (!(object->get_visibility() & PATH_RAY_DIFFUSE)) { shader_flag |= SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_GLOSSY)) { shader_flag |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_TRANSMIT)) { 
shader_flag |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_VOLUME_SCATTER)) { shader_flag |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; + } + if (!(object->get_is_shadow_catcher())) { + shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER; } size_t mesh_num_triangles = mesh->num_triangles(); @@ -496,10 +483,10 @@ void LightManager::device_update_distribution(Device *, kfilm->pass_shadow_scale = 1.0f; if (kintegrator->pdf_triangles != 0.0f) - kfilm->pass_shadow_scale *= 0.5f; + kfilm->pass_shadow_scale /= 0.5f; if (num_background_lights < num_lights) - kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights; + kfilm->pass_shadow_scale /= (float)(num_lights - num_background_lights) / (float)num_lights; /* CDF */ dscene->light_distribution.copy_to_device(); @@ -766,25 +753,26 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc if (!light->cast_shadow) shader_id &= ~SHADER_CAST_SHADOW; + if (!light->use_camera) { + shader_id |= SHADER_EXCLUDE_CAMERA; + } if (!light->use_diffuse) { shader_id |= SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!light->use_glossy) { shader_id |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!light->use_transmission) { shader_id |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!light->use_scatter) { shader_id |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; + } + if (!light->is_shadow_catcher) { + shader_id |= SHADER_EXCLUDE_SHADOW_CATCHER; } klights[light_index].type = light->light_type; - klights[light_index].samples = light->samples; klights[light_index].strength[0] = light->strength.x; klights[light_index].strength[1] = light->strength.y; klights[light_index].strength[2] = light->strength.z; @@ -836,19 +824,15 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc if (!(visibility & PATH_RAY_DIFFUSE)) { shader_id |= 
SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!(visibility & PATH_RAY_GLOSSY)) { shader_id |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!(visibility & PATH_RAY_TRANSMIT)) { shader_id |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!(visibility & PATH_RAY_VOLUME_SCATTER)) { shader_id |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; } } else if (light->light_type == LIGHT_AREA) { @@ -998,8 +982,6 @@ void LightManager::device_update(Device *device, device_free(device, dscene, need_update_background); - use_light_visibility = false; - device_update_points(device, dscene, scene); if (progress.get_cancel()) return; @@ -1018,8 +1000,6 @@ void LightManager::device_update(Device *device, if (progress.get_cancel()) return; - scene->film->set_use_light_visibility(use_light_visibility); - update_flags = UPDATE_NONE; need_update_background = false; } diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index fbd709125ff..7f86237c8b3 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -69,16 +69,17 @@ class Light : public Node { NODE_SOCKET_API(bool, cast_shadow) NODE_SOCKET_API(bool, use_mis) + NODE_SOCKET_API(bool, use_camera) NODE_SOCKET_API(bool, use_diffuse) NODE_SOCKET_API(bool, use_glossy) NODE_SOCKET_API(bool, use_transmission) NODE_SOCKET_API(bool, use_scatter) + NODE_SOCKET_API(bool, is_shadow_catcher) NODE_SOCKET_API(bool, is_portal) NODE_SOCKET_API(bool, is_enabled) NODE_SOCKET_API(Shader *, shader) - NODE_SOCKET_API(int, samples) NODE_SOCKET_API(int, max_bounces) NODE_SOCKET_API(uint, random_id) @@ -108,8 +109,6 @@ class LightManager { UPDATE_NONE = 0u, }; - bool use_light_visibility; - /* Need to update background (including multiple importance map) */ bool need_update_background; diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index b39d81023d9..c00c4c24211 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ 
b/intern/cycles/render/mesh_displace.cpp @@ -16,6 +16,8 @@ #include "device/device.h" +#include "integrator/shader_eval.h" + #include "render/mesh.h" #include "render/object.h" #include "render/scene.h" @@ -43,40 +45,28 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts) return norm / normlen; } -bool GeometryManager::displace( - Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress) +/* Fill in coordinates for mesh displacement shader evaluation on device. */ +static int fill_shader_input(const Scene *scene, + const Mesh *mesh, + const int object_index, + device_vector<KernelShaderEvalInput> &d_input) { - /* verify if we have a displacement shader */ - if (!mesh->has_true_displacement()) { - return false; - } - - string msg = string_printf("Computing Displacement %s", mesh->name.c_str()); - progress.set_status("Updating Mesh", msg); + int d_input_size = 0; + KernelShaderEvalInput *d_input_data = d_input.data(); - /* find object index. 
todo: is arbitrary */ - size_t object_index = OBJECT_NONE; + const array<int> &mesh_shaders = mesh->get_shader(); + const array<Node *> &mesh_used_shaders = mesh->get_used_shaders(); + const array<float3> &mesh_verts = mesh->get_verts(); - for (size_t i = 0; i < scene->objects.size(); i++) { - if (scene->objects[i]->get_geometry() == mesh) { - object_index = i; - break; - } - } - - /* setup input for device task */ - const size_t num_verts = mesh->verts.size(); + const int num_verts = mesh_verts.size(); vector<bool> done(num_verts, false); - device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY); - uint4 *d_input_data = d_input.alloc(num_verts); - size_t d_input_size = 0; - size_t num_triangles = mesh->num_triangles(); - for (size_t i = 0; i < num_triangles; i++) { + int num_triangles = mesh->num_triangles(); + for (int i = 0; i < num_triangles; i++) { Mesh::Triangle t = mesh->get_triangle(i); - int shader_index = mesh->shader[i]; - Shader *shader = (shader_index < mesh->used_shaders.size()) ? - static_cast<Shader *>(mesh->used_shaders[shader_index]) : + int shader_index = mesh_shaders[i]; + Shader *shader = (shader_index < mesh_used_shaders.size()) ? 
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) : scene->default_surface; if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) { @@ -110,57 +100,41 @@ bool GeometryManager::displace( } /* back */ - uint4 in = make_uint4(object, prim, __float_as_int(u), __float_as_int(v)); + KernelShaderEvalInput in; + in.object = object; + in.prim = prim; + in.u = u; + in.v = v; d_input_data[d_input_size++] = in; } } - if (d_input_size == 0) - return false; - - /* run device task */ - device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE); - d_output.alloc(d_input_size); - d_output.zero_to_device(); - d_input.copy_to_device(); - - /* needs to be up to data for attribute access */ - device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - - DeviceTask task(DeviceTask::SHADER); - task.shader_input = d_input.device_pointer; - task.shader_output = d_output.device_pointer; - task.shader_eval_type = SHADER_EVAL_DISPLACE; - task.shader_x = 0; - task.shader_w = d_output.size(); - task.num_samples = 1; - task.get_cancel = function_bind(&Progress::get_cancel, &progress); - - device->task_add(task); - device->task_wait(); - - if (progress.get_cancel()) { - d_input.free(); - d_output.free(); - return false; - } + return d_input_size; +} - d_output.copy_from_device(0, 1, d_output.size()); - d_input.free(); +/* Read back mesh displacement shader output. 
*/ +static void read_shader_output(const Scene *scene, + Mesh *mesh, + const device_vector<float4> &d_output) +{ + const array<int> &mesh_shaders = mesh->get_shader(); + const array<Node *> &mesh_used_shaders = mesh->get_used_shaders(); + array<float3> &mesh_verts = mesh->get_verts(); - /* read result */ - done.clear(); - done.resize(num_verts, false); - int k = 0; + const int num_verts = mesh_verts.size(); + const int num_motion_steps = mesh->get_motion_steps(); + vector<bool> done(num_verts, false); - float4 *offset = d_output.data(); + const float4 *d_output_data = d_output.data(); + int d_output_index = 0; Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - for (size_t i = 0; i < num_triangles; i++) { + int num_triangles = mesh->num_triangles(); + for (int i = 0; i < num_triangles; i++) { Mesh::Triangle t = mesh->get_triangle(i); - int shader_index = mesh->shader[i]; - Shader *shader = (shader_index < mesh->used_shaders.size()) ? - static_cast<Shader *>(mesh->used_shaders[shader_index]) : + int shader_index = mesh_shaders[i]; + Shader *shader = (shader_index < mesh_used_shaders.size()) ? + static_cast<Shader *>(mesh_used_shaders[shader_index]) : scene->default_surface; if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) { @@ -170,12 +144,12 @@ bool GeometryManager::displace( for (int j = 0; j < 3; j++) { if (!done[t.v[j]]) { done[t.v[j]] = true; - float3 off = float4_to_float3(offset[k++]); + float3 off = float4_to_float3(d_output_data[d_output_index++]); /* Avoid illegal vertex coordinates. 
*/ off = ensure_finite3(off); - mesh->verts[t.v[j]] += off; + mesh_verts[t.v[j]] += off; if (attr_mP != NULL) { - for (int step = 0; step < mesh->motion_steps - 1; step++) { + for (int step = 0; step < num_motion_steps - 1; step++) { float3 *mP = attr_mP->data_float3() + step * num_verts; mP[t.v[j]] += off; } @@ -183,8 +157,47 @@ bool GeometryManager::displace( } } } +} - d_output.free(); +bool GeometryManager::displace( + Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress) +{ + /* verify if we have a displacement shader */ + if (!mesh->has_true_displacement()) { + return false; + } + + const size_t num_verts = mesh->verts.size(); + const size_t num_triangles = mesh->num_triangles(); + + if (num_triangles == 0) { + return false; + } + + string msg = string_printf("Computing Displacement %s", mesh->name.c_str()); + progress.set_status("Updating Mesh", msg); + + /* find object index. todo: is arbitrary */ + size_t object_index = OBJECT_NONE; + + for (size_t i = 0; i < scene->objects.size(); i++) { + if (scene->objects[i]->get_geometry() == mesh) { + object_index = i; + break; + } + } + + /* Needs to be up to data for attribute access. */ + device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); + + /* Evaluate shader on device. 
*/ + ShaderEval shader_eval(device, progress); + if (!shader_eval.eval(SHADER_EVAL_DISPLACE, + num_verts, + function_bind(&fill_shader_input, scene, mesh, object_index, _1), + function_bind(&read_shader_output, scene, mesh, _1))) { + return false; + } /* stitch */ unordered_set<int> stitch_keys; @@ -297,8 +310,7 @@ bool GeometryManager::displace( } /* normalize vertex normals */ - done.clear(); - done.resize(num_verts, false); + vector<bool> done(num_verts, false); for (size_t i = 0; i < num_triangles; i++) { if (tri_has_true_disp[i]) { @@ -368,8 +380,7 @@ bool GeometryManager::displace( } /* normalize vertex normals */ - done.clear(); - done.resize(num_verts, false); + vector<bool> done(num_verts, false); for (size_t i = 0; i < num_triangles; i++) { if (tri_has_true_disp[i]) { diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 795166bcf4c..5303d55242e 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -2736,18 +2736,21 @@ NODE_DEFINE(PrincipledBsdfNode) distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); static NodeEnum subsurface_method_enum; - subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID); - subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + subsurface_method_enum.insert("random_walk_fixed_radius", + CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); + subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); SOCKET_ENUM(subsurface_method, "Subsurface Method", subsurface_method_enum, - CLOSURE_BSSRDF_PRINCIPLED_ID); + CLOSURE_BSSRDF_RANDOM_WALK_ID); SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f)); SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f)); SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f); SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f); SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 
0.1f)); + SOCKET_IN_FLOAT(subsurface_ior, "Subsurface IOR", 1.4f); + SOCKET_IN_FLOAT(subsurface_anisotropy, "Subsurface Anisotropy", 0.0f); SOCKET_IN_FLOAT(specular, "Specular", 0.0f); SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f); @@ -2857,6 +2860,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius, + ShaderInput *p_subsurface_ior, + ShaderInput *p_subsurface_anisotropy, ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, @@ -2896,6 +2901,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness); int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation); int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius); + int subsurface_ior_offset = compiler.stack_assign(p_subsurface_ior); + int subsurface_anisotropy_offset = compiler.stack_assign(p_subsurface_anisotropy); compiler.add_node(NODE_CLOSURE_BSDF, compiler.encode_uchar4(closure, @@ -2929,8 +2936,10 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, __float_as_int(bc_default.y), __float_as_int(bc_default.z)); - compiler.add_node( - clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID); + compiler.add_node(clearcoat_normal_offset, + subsurface_radius_offset, + subsurface_ior_offset, + subsurface_anisotropy_offset); float3 ss_default = get_float3(subsurface_color_in->socket_type); @@ -2953,6 +2962,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler) input("Metallic"), input("Subsurface"), input("Subsurface Radius"), + input("Subsurface IOR"), + input("Subsurface Anisotropy"), input("Specular"), input("Roughness"), input("Specular Tint"), @@ -3048,16 +3059,16 @@ NODE_DEFINE(SubsurfaceScatteringNode) SOCKET_IN_NORMAL(normal, "Normal", zero_float3(), 
SocketType::LINK_NORMAL); SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL); - static NodeEnum falloff_enum; - falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID); - falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID); - falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID); - falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); - SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID); + static NodeEnum method_enum; + method_enum.insert("random_walk_fixed_radius", CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); + method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); + SOCKET_ENUM(method, "Method", method_enum, CLOSURE_BSSRDF_RANDOM_WALK_ID); + SOCKET_IN_FLOAT(scale, "Scale", 0.01f); SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f)); - SOCKET_IN_FLOAT(sharpness, "Sharpness", 0.0f); - SOCKET_IN_FLOAT(texture_blur, "Texture Blur", 1.0f); + + SOCKET_IN_FLOAT(subsurface_ior, "IOR", 1.4f); + SOCKET_IN_FLOAT(subsurface_anisotropy, "Anisotropy", 0.0f); SOCKET_OUT_CLOSURE(BSSRDF, "BSSRDF"); @@ -3066,20 +3077,19 @@ NODE_DEFINE(SubsurfaceScatteringNode) SubsurfaceScatteringNode::SubsurfaceScatteringNode() : BsdfNode(get_node_type()) { - closure = falloff; + closure = method; } void SubsurfaceScatteringNode::compile(SVMCompiler &compiler) { - closure = falloff; - BsdfNode::compile( - compiler, input("Scale"), input("Texture Blur"), input("Radius"), input("Sharpness")); + closure = method; + BsdfNode::compile(compiler, input("Scale"), input("IOR"), input("Radius"), input("Anisotropy")); } void SubsurfaceScatteringNode::compile(OSLCompiler &compiler) { - closure = falloff; - compiler.parameter(this, "falloff"); + closure = method; + compiler.parameter(this, "method"); compiler.add(this, "node_subsurface_scattering"); } @@ -3786,20 +3796,6 @@ void GeometryNode::compile(OSLCompiler &compiler) compiler.add(this, "node_geometry"); } -int GeometryNode::get_group() -{ - 
ShaderOutput *out; - int result = ShaderNode::get_group(); - - /* Backfacing uses NODE_LIGHT_PATH */ - out = output("Backfacing"); - if (!out->links.empty()) { - result = max(result, NODE_GROUP_LEVEL_1); - } - - return result; -} - /* TextureCoordinate */ NODE_DEFINE(TextureCoordinateNode) @@ -5926,33 +5922,33 @@ NODE_DEFINE(OutputAOVNode) OutputAOVNode::OutputAOVNode() : ShaderNode(get_node_type()) { special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV; - slot = -1; + offset = -1; } void OutputAOVNode::simplify_settings(Scene *scene) { - slot = scene->film->get_aov_offset(scene, name.string(), is_color); - if (slot == -1) { - slot = scene->film->get_aov_offset(scene, name.string(), is_color); + offset = scene->film->get_aov_offset(scene, name.string(), is_color); + if (offset == -1) { + offset = scene->film->get_aov_offset(scene, name.string(), is_color); } - if (slot == -1 || is_color) { + if (offset == -1 || is_color) { input("Value")->disconnect(); } - if (slot == -1 || !is_color) { + if (offset == -1 || !is_color) { input("Color")->disconnect(); } } void OutputAOVNode::compile(SVMCompiler &compiler) { - assert(slot >= 0); + assert(offset >= 0); if (is_color) { - compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot); + compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), offset); } else { - compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot); + compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), offset); } } diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 3013e9b1866..22bdb06b059 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -143,10 +143,6 @@ class EnvironmentTextureNode : public ImageSlotTextureNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } virtual bool equals(const ShaderNode &other) { @@ -170,11 +166,6 @@ class SkyTextureNode : public TextureNode { public: 
SHADER_NODE_CLASS(SkyTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeSkyType, sky_type) NODE_SOCKET_API(float3, sun_direction) NODE_SOCKET_API(float, turbidity) @@ -224,18 +215,13 @@ class OutputAOVNode : public ShaderNode { NODE_SOCKET_API(ustring, name) - virtual int get_group() - { - return NODE_GROUP_LEVEL_4; - } - /* Don't allow output node de-duplication. */ virtual bool equals(const ShaderNode & /*other*/) { return false; } - int slot; + int offset; bool is_color; }; @@ -243,11 +229,6 @@ class GradientTextureNode : public TextureNode { public: SHADER_NODE_CLASS(GradientTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeGradientType, gradient_type) NODE_SOCKET_API(float3, vector) }; @@ -269,19 +250,14 @@ class VoronoiTextureNode : public TextureNode { public: SHADER_NODE_CLASS(VoronoiTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - virtual int get_feature() { int result = ShaderNode::get_feature(); if (dimensions == 4) { - result |= NODE_FEATURE_VORONOI_EXTRA; + result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA; } else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) { - result |= NODE_FEATURE_VORONOI_EXTRA; + result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA; } return result; } @@ -301,11 +277,6 @@ class MusgraveTextureNode : public TextureNode { public: SHADER_NODE_CLASS(MusgraveTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(int, dimensions) NODE_SOCKET_API(NodeMusgraveType, musgrave_type) NODE_SOCKET_API(float, w) @@ -322,11 +293,6 @@ class WaveTextureNode : public TextureNode { public: SHADER_NODE_CLASS(WaveTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeWaveType, wave_type) NODE_SOCKET_API(NodeWaveBandsDirection, bands_direction) NODE_SOCKET_API(NodeWaveRingsDirection, rings_direction) @@ -345,11 +311,6 @@ class MagicTextureNode : public 
TextureNode { public: SHADER_NODE_CLASS(MagicTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(int, depth) NODE_SOCKET_API(float3, vector) NODE_SOCKET_API(float, scale) @@ -364,11 +325,6 @@ class CheckerTextureNode : public TextureNode { NODE_SOCKET_API(float3, color1) NODE_SOCKET_API(float3, color2) NODE_SOCKET_API(float, scale) - - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class BrickTextureNode : public TextureNode { @@ -390,20 +346,11 @@ class BrickTextureNode : public TextureNode { NODE_SOCKET_API(float, brick_width) NODE_SOCKET_API(float, row_height) NODE_SOCKET_API(float3, vector) - - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class PointDensityTextureNode : public ShaderNode { public: SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_4; - } ~PointDensityTextureNode(); ShaderNode *clone(ShaderGraph *graph) const; @@ -443,10 +390,6 @@ class IESLightNode : public TextureNode { ~IESLightNode(); ShaderNode *clone(ShaderGraph *graph) const; - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(ustring, filename) NODE_SOCKET_API(ustring, ies) @@ -464,10 +407,6 @@ class IESLightNode : public TextureNode { class WhiteNoiseTextureNode : public ShaderNode { public: SHADER_NODE_CLASS(WhiteNoiseTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(int, dimensions) NODE_SOCKET_API(float3, vector) @@ -477,10 +416,6 @@ class WhiteNoiseTextureNode : public ShaderNode { class MappingNode : public ShaderNode { public: SHADER_NODE_CLASS(MappingNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } void constant_fold(const ConstantFolder &folder); NODE_SOCKET_API(float3, vector) @@ -546,6 +481,11 @@ class BsdfBaseNode : public ShaderNode { return false; } + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_BSDF; + } + 
protected: ClosureType closure; }; @@ -606,6 +546,8 @@ class PrincipledBsdfNode : public BsdfBaseNode { ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, + ShaderInput *subsurface_ior, + ShaderInput *subsurface_anisotropy, ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, @@ -622,6 +564,8 @@ class PrincipledBsdfNode : public BsdfBaseNode { NODE_SOCKET_API(float3, base_color) NODE_SOCKET_API(float3, subsurface_color) NODE_SOCKET_API(float3, subsurface_radius) + NODE_SOCKET_API(float, subsurface_ior) + NODE_SOCKET_API(float, subsurface_anisotropy) NODE_SOCKET_API(float, metallic) NODE_SOCKET_API(float, subsurface) NODE_SOCKET_API(float, specular) @@ -758,14 +702,14 @@ class SubsurfaceScatteringNode : public BsdfNode { bool has_bssrdf_bump(); ClosureType get_closure_type() { - return falloff; + return method; } NODE_SOCKET_API(float, scale) NODE_SOCKET_API(float3, radius) - NODE_SOCKET_API(float, sharpness) - NODE_SOCKET_API(float, texture_blur) - NODE_SOCKET_API(ClosureType, falloff) + NODE_SOCKET_API(float, subsurface_ior) + NODE_SOCKET_API(float, subsurface_anisotropy) + NODE_SOCKET_API(ClosureType, method) }; class EmissionNode : public ShaderNode { @@ -782,6 +726,11 @@ class EmissionNode : public ShaderNode { return true; } + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION; + } + NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, surface_mix_weight) @@ -792,6 +741,11 @@ class BackgroundNode : public ShaderNode { SHADER_NODE_CLASS(BackgroundNode) void constant_fold(const ConstantFolder &folder); + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION; + } + NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, surface_mix_weight) @@ -800,10 +754,6 @@ class BackgroundNode : public ShaderNode { class HoldoutNode : public ShaderNode { public: 
SHADER_NODE_CLASS(HoldoutNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual ClosureType get_closure_type() { return CLOSURE_HOLDOUT_ID; @@ -821,13 +771,9 @@ class AmbientOcclusionNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - virtual bool has_raytrace() + virtual int get_feature() { - return true; + return KERNEL_FEATURE_NODE_RAYTRACE; } NODE_SOCKET_API(float3, color) @@ -845,13 +791,9 @@ class VolumeNode : public ShaderNode { SHADER_NODE_BASE_CLASS(VolumeNode) void compile(SVMCompiler &compiler, ShaderInput *param1, ShaderInput *param2); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual int get_feature() { - return ShaderNode::get_feature() | NODE_FEATURE_VOLUME; + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_VOLUME; } virtual ClosureType get_closure_type() { @@ -1013,10 +955,6 @@ class UVMapNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(ustring, attribute) NODE_SOCKET_API(bool, from_dupli) @@ -1025,10 +963,6 @@ class UVMapNode : public ShaderNode { class LightPathNode : public ShaderNode { public: SHADER_NODE_CLASS(LightPathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class LightFalloffNode : public ShaderNode { @@ -1038,10 +972,6 @@ class LightFalloffNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, smooth) @@ -1050,10 +980,6 @@ class LightFalloffNode : public ShaderNode { class ObjectInfoNode : public ShaderNode { public: SHADER_NODE_CLASS(ObjectInfoNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class ParticleInfoNode : public ShaderNode { @@ -1064,10 +990,6 @@ class ParticleInfoNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class HairInfoNode : 
public ShaderNode { @@ -1083,13 +1005,9 @@ class HairInfoNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual int get_feature() { - return ShaderNode::get_feature() | NODE_FEATURE_HAIR; + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR; } }; @@ -1168,10 +1086,6 @@ class InvertNode : public ShaderNode { public: SHADER_NODE_CLASS(InvertNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, fac) NODE_SOCKET_API(float3, color) @@ -1182,11 +1096,6 @@ class MixNode : public ShaderNode { SHADER_NODE_CLASS(MixNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API(NodeMix, mix_type) NODE_SOCKET_API(bool, use_clamp) NODE_SOCKET_API(float3, color1) @@ -1198,10 +1107,6 @@ class CombineRGBNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineRGBNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, r) NODE_SOCKET_API(float, g) @@ -1212,10 +1117,6 @@ class CombineHSVNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineHSVNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, h) NODE_SOCKET_API(float, s) @@ -1226,10 +1127,6 @@ class CombineXYZNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineXYZNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, x) NODE_SOCKET_API(float, y) @@ -1240,10 +1137,6 @@ class GammaNode : public ShaderNode { public: SHADER_NODE_CLASS(GammaNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, gamma) @@ -1253,10 +1146,6 
@@ class BrightContrastNode : public ShaderNode { public: SHADER_NODE_CLASS(BrightContrastNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, bright) @@ -1267,10 +1156,6 @@ class SeparateRGBNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateRGBNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, color) }; @@ -1279,10 +1164,6 @@ class SeparateHSVNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateHSVNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, color) }; @@ -1291,10 +1172,6 @@ class SeparateXYZNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateXYZNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, vector) }; @@ -1333,10 +1210,6 @@ class CameraNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class FresnelNode : public ShaderNode { @@ -1346,10 +1219,6 @@ class FresnelNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, normal) NODE_SOCKET_API(float, IOR) @@ -1362,10 +1231,6 @@ class LayerWeightNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, normal) NODE_SOCKET_API(float, blend) @@ -1378,10 +1243,6 @@ class WireframeNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, size) NODE_SOCKET_API(bool, use_pixel_size) @@ -1390,10 +1251,6 @@ class WireframeNode : public ShaderNode { class WavelengthNode : public ShaderNode { public: 
SHADER_NODE_CLASS(WavelengthNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, wavelength) }; @@ -1402,10 +1259,6 @@ class BlackbodyNode : public ShaderNode { public: SHADER_NODE_CLASS(BlackbodyNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, temperature) }; @@ -1413,10 +1266,6 @@ class BlackbodyNode : public ShaderNode { class MapRangeNode : public ShaderNode { public: SHADER_NODE_CLASS(MapRangeNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } void expand(ShaderGraph *graph); NODE_SOCKET_API(float, value) @@ -1433,10 +1282,6 @@ class ClampNode : public ShaderNode { public: SHADER_NODE_CLASS(ClampNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, value) NODE_SOCKET_API(float, min) NODE_SOCKET_API(float, max) @@ -1446,10 +1291,6 @@ class ClampNode : public ShaderNode { class MathNode : public ShaderNode { public: SHADER_NODE_CLASS(MathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } void expand(ShaderGraph *graph); void constant_fold(const ConstantFolder &folder); @@ -1463,10 +1304,6 @@ class MathNode : public ShaderNode { class NormalNode : public ShaderNode { public: SHADER_NODE_CLASS(NormalNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(float3, direction) NODE_SOCKET_API(float3, normal) @@ -1475,10 +1312,6 @@ class NormalNode : public ShaderNode { class VectorMathNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorMathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } void constant_fold(const ConstantFolder &folder); NODE_SOCKET_API(float3, vector1) @@ -1492,10 +1325,6 @@ class VectorRotateNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorRotateNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } 
NODE_SOCKET_API(NodeVectorRotateType, rotate_type) NODE_SOCKET_API(bool, invert) NODE_SOCKET_API(float3, vector) @@ -1509,11 +1338,6 @@ class VectorTransformNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorTransformNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API(NodeVectorTransformType, transform_type) NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_from) NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_to) @@ -1530,7 +1354,7 @@ class BumpNode : public ShaderNode { } virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(bool, invert) @@ -1549,11 +1373,6 @@ class CurvesNode : public ShaderNode { explicit CurvesNode(const NodeType *node_type); SHADER_NODE_BASE_CLASS(CurvesNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API_ARRAY(array<float3>, curves) NODE_SOCKET_API(float, min_x) NODE_SOCKET_API(float, max_x) @@ -1583,10 +1402,6 @@ class RGBRampNode : public ShaderNode { public: SHADER_NODE_CLASS(RGBRampNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API_ARRAY(array<float3>, ramp) NODE_SOCKET_API_ARRAY(array<float>, ramp_alpha) @@ -1656,10 +1471,6 @@ class NormalMapNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(NodeNormalMapSpace, space) NODE_SOCKET_API(ustring, attribute) @@ -1680,10 +1491,6 @@ class TangentNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(NodeTangentDirectionType, direction_type) NODE_SOCKET_API(NodeTangentAxis, axis) @@ -1698,13 +1505,9 @@ class BevelNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - virtual bool has_raytrace() + virtual int get_feature() { - return true; + return 
KERNEL_FEATURE_NODE_RAYTRACE; } NODE_SOCKET_API(float, radius) @@ -1718,7 +1521,7 @@ class DisplacementNode : public ShaderNode { void constant_fold(const ConstantFolder &folder); virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(NodeNormalMapSpace, space) @@ -1739,7 +1542,7 @@ class VectorDisplacementNode : public ShaderNode { void constant_fold(const ConstantFolder &folder); virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(NodeNormalMapSpace, space) diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index c88d94fe4c2..4637f8fe989 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -216,6 +216,10 @@ void Object::tag_update(Scene *scene) if (use_holdout_is_modified()) { flag |= ObjectManager::HOLDOUT_MODIFIED; } + + if (is_shadow_catcher_is_modified()) { + scene->tag_shadow_catcher_modified(); + } } if (geometry) { @@ -273,14 +277,7 @@ bool Object::is_traceable() const uint Object::visibility_for_tracing() const { - uint trace_visibility = visibility; - if (is_shadow_catcher) { - trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER; - } - else { - trace_visibility &= ~PATH_RAY_SHADOW_CATCHER; - } - return trace_visibility; + return SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility & PATH_RAY_ALL_VISIBILITY); } float Object::compute_volume_step_size() const @@ -680,7 +677,7 @@ void ObjectManager::device_update(Device *device, /* prepare for static BVH building */ /* todo: do before to support getting object level coords? 
*/ - if (scene->params.bvh_type == SceneParams::BVH_STATIC) { + if (scene->params.bvh_type == BVH_TYPE_STATIC) { scoped_callback_timer timer([scene](double time) { if (scene->update_stats) { scene->update_stats->object.times.add_entry( @@ -932,6 +929,11 @@ void ObjectManager::tag_update(Scene *scene, uint32_t flag) } scene->light_manager->tag_update(scene, LightManager::OBJECT_MANAGER); + + /* Integrator's shadow catcher settings depends on object visibility settings. */ + if (flag & (OBJECT_ADDED | OBJECT_REMOVED | OBJECT_MODIFIED)) { + scene->integrator->tag_update(scene, Integrator::OBJECT_MANAGER); + } } bool ObjectManager::need_update() const diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 7dc79f48145..d28b222c10e 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -113,7 +113,7 @@ void OSLShaderManager::device_update_specific(Device *device, scene->image_manager->set_osl_texture_system((void *)ts); /* create shaders */ - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); Shader *background_shader = scene->background->get_shader(scene); foreach (Shader *shader, scene->shaders) { @@ -174,7 +174,7 @@ void OSLShaderManager::device_update_specific(Device *device, void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene) { - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); device_free_common(device, dscene, scene); @@ -257,25 +257,36 @@ void OSLShaderManager::shading_system_init() /* our own ray types */ static const char *raytypes[] = { - "camera", /* PATH_RAY_CAMERA */ - "reflection", /* PATH_RAY_REFLECT */ - "refraction", /* PATH_RAY_TRANSMIT */ - "diffuse", /* PATH_RAY_DIFFUSE */ - "glossy", /* PATH_RAY_GLOSSY */ - "singular", /* PATH_RAY_SINGULAR */ - "transparent", /* PATH_RAY_TRANSPARENT */ - - "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER 
*/ - "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */ - - "__unused__", "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ - "__unused__", - - "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ - "__unused__", "__unused__", "__unused__", "__unused__", - "__unused__", "__unused__", "__unused__", + "camera", /* PATH_RAY_CAMERA */ + "reflection", /* PATH_RAY_REFLECT */ + "refraction", /* PATH_RAY_TRANSMIT */ + "diffuse", /* PATH_RAY_DIFFUSE */ + "glossy", /* PATH_RAY_GLOSSY */ + "singular", /* PATH_RAY_SINGULAR */ + "transparent", /* PATH_RAY_TRANSPARENT */ + "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ + + "shadow", /* PATH_RAY_SHADOW_OPAQUE */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ + + "__unused__", /* PATH_RAY_NODE_UNALIGNED */ + "__unused__", /* PATH_RAY_MIS_SKIP */ + + "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ + + "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */ + "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */ + "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */ + "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */ + "__unused__", /* PATH_RAY_EMISSION */ + "__unused__", /* PATH_RAY_SUBSURFACE */ + "__unused__", /* PATH_RAY_DENOISING_FEATURES */ + "__unused__", /* PATH_RAY_REFLECT_PASS */ + "__unused__", /* PATH_RAY_TRANSMISSION_PASS */ + "__unused__", /* PATH_RAY_VOLUME_PASS */ + "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */ + "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */ + "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */ }; const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]); @@ -758,7 +769,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) current_shader->has_surface_bssrdf = true; current_shader->has_bssrdf_bump = true; /* can't detect yet */ } - current_shader->has_bump = true; /* can't detect yet */ + current_shader->has_bump = true; /* can't detect yet */ + 
current_shader->has_surface_raytrace = true; /* can't detect yet */ } if (node->has_spatial_varying()) { @@ -1054,6 +1066,8 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes) current_shader->has_surface_emission = true; if (node->has_surface_transparent()) current_shader->has_surface_transparent = true; + if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE) + current_shader->has_surface_raytrace = true; if (node->has_spatial_varying()) current_shader->has_surface_spatial_varying = true; if (node->has_surface_bssrdf()) { diff --git a/intern/cycles/render/pass.cpp b/intern/cycles/render/pass.cpp new file mode 100644 index 00000000000..27ad7c0db97 --- /dev/null +++ b/intern/cycles/render/pass.cpp @@ -0,0 +1,427 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/pass.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *pass_type_as_string(const PassType type) +{ + const int type_int = static_cast<int>(type); + + const NodeEnum *type_enum = Pass::get_type_enum(); + + if (!type_enum->exists(type_int)) { + LOG(DFATAL) << "Unhandled pass type " << static_cast<int>(type) << ", not supposed to happen."; + return "UNKNOWN"; + } + + return (*type_enum)[type_int].c_str(); +} + +const char *pass_mode_as_string(PassMode mode) +{ + switch (mode) { + case PassMode::NOISY: + return "NOISY"; + case PassMode::DENOISED: + return "DENOISED"; + } + + LOG(DFATAL) << "Unhandled pass mode " << static_cast<int>(mode) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, PassMode mode) +{ + os << pass_mode_as_string(mode); + return os; +} + +const NodeEnum *Pass::get_type_enum() +{ + static NodeEnum pass_type_enum; + + if (pass_type_enum.empty()) { + + /* Light Passes. 
*/ + pass_type_enum.insert("combined", PASS_COMBINED); + pass_type_enum.insert("emission", PASS_EMISSION); + pass_type_enum.insert("background", PASS_BACKGROUND); + pass_type_enum.insert("ao", PASS_AO); + pass_type_enum.insert("shadow", PASS_SHADOW); + pass_type_enum.insert("diffuse", PASS_DIFFUSE); + pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT); + pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT); + pass_type_enum.insert("glossy", PASS_GLOSSY); + pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT); + pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT); + pass_type_enum.insert("transmission", PASS_TRANSMISSION); + pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT); + pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT); + pass_type_enum.insert("volume", PASS_VOLUME); + pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT); + pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT); + + /* Data passes. 
*/ + pass_type_enum.insert("depth", PASS_DEPTH); + pass_type_enum.insert("position", PASS_POSITION); + pass_type_enum.insert("normal", PASS_NORMAL); + pass_type_enum.insert("roughness", PASS_ROUGHNESS); + pass_type_enum.insert("uv", PASS_UV); + pass_type_enum.insert("object_id", PASS_OBJECT_ID); + pass_type_enum.insert("material_id", PASS_MATERIAL_ID); + pass_type_enum.insert("motion", PASS_MOTION); + pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT); + pass_type_enum.insert("render_time", PASS_RENDER_TIME); + pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE); + pass_type_enum.insert("aov_color", PASS_AOV_COLOR); + pass_type_enum.insert("aov_value", PASS_AOV_VALUE); + pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER); + pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT); + pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR); + pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR); + pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR); + pass_type_enum.insert("mist", PASS_MIST); + pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL); + pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO); + + pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER); + pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT); + pass_type_enum.insert("shadow_catcher_matte", PASS_SHADOW_CATCHER_MATTE); + + pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE); + pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL); + } + + return &pass_type_enum; +} + +const NodeEnum *Pass::get_mode_enum() +{ + static NodeEnum pass_mode_enum; + + if (pass_mode_enum.empty()) { + pass_mode_enum.insert("noisy", static_cast<int>(PassMode::NOISY)); + pass_mode_enum.insert("denoised", static_cast<int>(PassMode::DENOISED)); + } + + return &pass_mode_enum; +} + +NODE_DEFINE(Pass) +{ + NodeType *type = NodeType::add("pass", create); + + const NodeEnum *pass_type_enum 
= get_type_enum(); + const NodeEnum *pass_mode_enum = get_mode_enum(); + + SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); + SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED)); + SOCKET_STRING(name, "Name", ustring()); + SOCKET_BOOLEAN(include_albedo, "Include Albedo", false); + + return type; +} + +Pass::Pass() : Node(get_node_type()), is_auto_(false) +{ +} + +PassInfo Pass::get_info() const +{ + return get_info(type, include_albedo); +} + +bool Pass::is_written() const +{ + return get_info().is_written; +} + +PassInfo Pass::get_info(const PassType type, const bool include_albedo) +{ + PassInfo pass_info; + + pass_info.use_filter = true; + pass_info.use_exposure = false; + pass_info.divide_type = PASS_NONE; + pass_info.use_compositing = false; + pass_info.use_denoising_albedo = true; + + switch (type) { + case PASS_NONE: + pass_info.num_components = 0; + break; + case PASS_COMBINED: + pass_info.num_components = 4; + pass_info.use_exposure = true; + pass_info.support_denoise = true; + break; + case PASS_DEPTH: + pass_info.num_components = 1; + pass_info.use_filter = false; + break; + case PASS_MIST: + pass_info.num_components = 1; + break; + case PASS_POSITION: + pass_info.num_components = 3; + break; + case PASS_NORMAL: + pass_info.num_components = 3; + break; + case PASS_ROUGHNESS: + pass_info.num_components = 1; + break; + case PASS_UV: + pass_info.num_components = 3; + break; + case PASS_MOTION: + pass_info.num_components = 4; + pass_info.divide_type = PASS_MOTION_WEIGHT; + break; + case PASS_MOTION_WEIGHT: + pass_info.num_components = 1; + break; + case PASS_OBJECT_ID: + case PASS_MATERIAL_ID: + pass_info.num_components = 1; + pass_info.use_filter = false; + break; + + case PASS_EMISSION: + case PASS_BACKGROUND: + pass_info.num_components = 3; + pass_info.use_exposure = true; + break; + case PASS_AO: + pass_info.num_components = 3; + break; + case PASS_SHADOW: + pass_info.num_components = 3; + pass_info.use_exposure = 
false; + break; + case PASS_RENDER_TIME: + /* This pass is handled entirely on the host side. */ + pass_info.num_components = 0; + break; + + case PASS_DIFFUSE_COLOR: + case PASS_GLOSSY_COLOR: + case PASS_TRANSMISSION_COLOR: + pass_info.num_components = 3; + break; + case PASS_DIFFUSE: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_DIFFUSE_DIRECT; + pass_info.indirect_type = PASS_DIFFUSE_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_DIFFUSE_DIRECT: + case PASS_DIFFUSE_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_GLOSSY: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_GLOSSY_DIRECT; + pass_info.indirect_type = PASS_GLOSSY_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_GLOSSY_DIRECT: + case PASS_GLOSSY_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_TRANSMISSION: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_TRANSMISSION_DIRECT; + pass_info.indirect_type = PASS_TRANSMISSION_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_TRANSMISSION_DIRECT: + case PASS_TRANSMISSION_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? 
PASS_TRANSMISSION_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_VOLUME: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_VOLUME_DIRECT; + pass_info.indirect_type = PASS_VOLUME_INDIRECT; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_VOLUME_DIRECT: + case PASS_VOLUME_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + break; + + case PASS_CRYPTOMATTE: + pass_info.num_components = 4; + break; + + case PASS_DENOISING_NORMAL: + pass_info.num_components = 3; + break; + case PASS_DENOISING_ALBEDO: + pass_info.num_components = 3; + break; + + case PASS_SHADOW_CATCHER: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.use_compositing = true; + pass_info.use_denoising_albedo = false; + pass_info.support_denoise = true; + break; + case PASS_SHADOW_CATCHER_SAMPLE_COUNT: + pass_info.num_components = 1; + break; + case PASS_SHADOW_CATCHER_MATTE: + pass_info.num_components = 4; + pass_info.use_exposure = true; + pass_info.support_denoise = true; + /* Without shadow catcher approximation compositing is not needed. + * Since we don't know here whether approximation is used or not, leave the decision up to + * the caller which will know that. 
*/ + break; + + case PASS_ADAPTIVE_AUX_BUFFER: + pass_info.num_components = 4; + break; + case PASS_SAMPLE_COUNT: + pass_info.num_components = 1; + pass_info.use_exposure = false; + break; + + case PASS_AOV_COLOR: + pass_info.num_components = 3; + break; + case PASS_AOV_VALUE: + pass_info.num_components = 1; + break; + + case PASS_BAKE_PRIMITIVE: + case PASS_BAKE_DIFFERENTIAL: + pass_info.num_components = 4; + pass_info.use_exposure = false; + pass_info.use_filter = false; + break; + + case PASS_CATEGORY_LIGHT_END: + case PASS_CATEGORY_DATA_END: + case PASS_CATEGORY_BAKE_END: + case PASS_NUM: + LOG(DFATAL) << "Unexpected pass type is used " << type; + pass_info.num_components = 0; + break; + } + + return pass_info; +} + +bool Pass::contains(const vector<Pass *> &passes, PassType type) +{ + for (const Pass *pass : passes) { + if (pass->get_type() != type) { + continue; + } + + return true; + } + + return false; +} + +const Pass *Pass::find(const vector<Pass *> &passes, const string &name) +{ + for (const Pass *pass : passes) { + if (pass->get_name() == name) { + return pass; + } + } + + return nullptr; +} + +const Pass *Pass::find(const vector<Pass *> &passes, PassType type, PassMode mode) +{ + for (const Pass *pass : passes) { + if (pass->get_type() != type || pass->get_mode() != mode) { + continue; + } + + return pass; + } + + return nullptr; +} + +int Pass::get_offset(const vector<Pass *> &passes, const Pass *pass) +{ + int pass_offset = 0; + + for (const Pass *current_pass : passes) { + /* Note that pass name is allowed to be empty. This is why we check for type and mode. 
*/ + if (current_pass->get_type() == pass->get_type() && + current_pass->get_mode() == pass->get_mode() && + current_pass->get_name() == pass->get_name()) { + if (current_pass->is_written()) { + return pass_offset; + } + else { + return PASS_UNUSED; + } + } + if (current_pass->is_written()) { + pass_offset += current_pass->get_info().num_components; + } + } + + return PASS_UNUSED; +} + +std::ostream &operator<<(std::ostream &os, const Pass &pass) +{ + os << "type: " << pass_type_as_string(pass.get_type()); + os << ", name: \"" << pass.get_name() << "\""; + os << ", mode: " << pass.get_mode(); + os << ", is_written: " << string_from_bool(pass.is_written()); + + return os; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/pass.h b/intern/cycles/render/pass.h new file mode 100644 index 00000000000..82230c62cb0 --- /dev/null +++ b/intern/cycles/render/pass.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include <ostream> // NOLINT + +#include "util/util_string.h" +#include "util/util_vector.h" + +#include "kernel/kernel_types.h" + +#include "graph/node.h" + +CCL_NAMESPACE_BEGIN + +const char *pass_type_as_string(const PassType type); + +enum class PassMode { + NOISY, + DENOISED, +}; +const char *pass_mode_as_string(PassMode mode); +std::ostream &operator<<(std::ostream &os, PassMode mode); + +struct PassInfo { + int num_components = -1; + bool use_filter = false; + bool use_exposure = false; + bool is_written = true; + PassType divide_type = PASS_NONE; + PassType direct_type = PASS_NONE; + PassType indirect_type = PASS_NONE; + + /* Pass access for read can not happen directly and needs some sort of compositing (for example, + * light passes due to divide_type, or shadow catcher pass. */ + bool use_compositing = false; + + /* Used to disable albedo pass for denoising. + * Light and shadow catcher passes should not have discontinuity in the denoised result based on + * the underlying albedo. */ + bool use_denoising_albedo = true; + + /* Pass supports denoising. */ + bool support_denoise = false; +}; + +class Pass : public Node { + public: + NODE_DECLARE + + NODE_SOCKET_API(PassType, type) + NODE_SOCKET_API(PassMode, mode) + NODE_SOCKET_API(ustring, name) + NODE_SOCKET_API(bool, include_albedo) + + Pass(); + + PassInfo get_info() const; + + /* The pass is written by the render pipeline (kernel or denoiser). If the pass is written it + * will have pixels allocated in a RenderBuffer. Passes which are not written do not have their + * pixels allocated to save memory. */ + bool is_written() const; + + protected: + /* The has been created automatically as a requirement to various rendering functionality (such + * as adaptive sampling). 
*/ + bool is_auto_; + + public: + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_mode_enum(); + + static PassInfo get_info(PassType type, const bool include_albedo = false); + + static bool contains(const vector<Pass *> &passes, PassType type); + + /* Returns nullptr if there is no pass with the given name or type+mode. */ + static const Pass *find(const vector<Pass *> &passes, const string &name); + static const Pass *find(const vector<Pass *> &passes, + PassType type, + PassMode mode = PassMode::NOISY); + + /* Returns PASS_UNUSED if there is no corresponding pass. */ + static int get_offset(const vector<Pass *> &passes, const Pass *pass); + + friend class Film; +}; + +std::ostream &operator<<(std::ostream &os, const Pass &pass); + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index c4e7d2c79d6..a4b030190dc 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -163,12 +163,15 @@ void Scene::free_memory(bool final) delete p; foreach (Light *l, lights) delete l; + foreach (Pass *p, passes) + delete p; geometry.clear(); objects.clear(); lights.clear(); particle_systems.clear(); procedurals.clear(); + passes.clear(); if (device) { camera->device_free(device, &dscene, this); @@ -253,7 +256,6 @@ void Scene::device_update(Device *device_, Progress &progress) * - Camera may be used for adaptive subdivision. * - Displacement shader must have all shader data available. * - Light manager needs lookup tables and final mesh data to compute emission CDF. 
- * - Film needs light manager to run for use_light_visibility * - Lookup tables are done a second time to handle film tables */ @@ -469,88 +471,110 @@ void Scene::enable_update_stats() } } -DeviceRequestedFeatures Scene::get_requested_device_features() +void Scene::update_kernel_features() { - DeviceRequestedFeatures requested_features; + if (!need_update()) { + return; + } - shader_manager->get_requested_features(this, &requested_features); + /* These features are not being tweaked as often as shaders, + * so could be done selective magic for the viewport as well. */ + uint kernel_features = shader_manager->get_kernel_features(this); - /* This features are not being tweaked as often as shaders, - * so could be done selective magic for the viewport as well. - */ bool use_motion = need_motion() == Scene::MotionType::MOTION_BLUR; - requested_features.use_hair = false; - requested_features.use_hair_thick = (params.hair_shape == CURVE_THICK); - requested_features.use_object_motion = false; - requested_features.use_camera_motion = use_motion && camera->use_motion(); + kernel_features |= KERNEL_FEATURE_PATH_TRACING; + if (params.hair_shape == CURVE_THICK) { + kernel_features |= KERNEL_FEATURE_HAIR_THICK; + } + if (use_motion && camera->use_motion()) { + kernel_features |= KERNEL_FEATURE_CAMERA_MOTION; + } foreach (Object *object, objects) { Geometry *geom = object->get_geometry(); if (use_motion) { - requested_features.use_object_motion |= object->use_motion() | geom->get_use_motion_blur(); - requested_features.use_camera_motion |= geom->get_use_motion_blur(); + if (object->use_motion() || geom->get_use_motion_blur()) { + kernel_features |= KERNEL_FEATURE_OBJECT_MOTION; + } + if (geom->get_use_motion_blur()) { + kernel_features |= KERNEL_FEATURE_CAMERA_MOTION; + } } if (object->get_is_shadow_catcher()) { - requested_features.use_shadow_tricks = true; + kernel_features |= KERNEL_FEATURE_SHADOW_CATCHER; } if (geom->is_mesh()) { Mesh *mesh = static_cast<Mesh *>(geom); 
#ifdef WITH_OPENSUBDIV if (mesh->get_subdivision_type() != Mesh::SUBDIVISION_NONE) { - requested_features.use_patch_evaluation = true; + kernel_features |= KERNEL_FEATURE_PATCH_EVALUATION; } #endif - requested_features.use_true_displacement |= mesh->has_true_displacement(); } else if (geom->is_hair()) { - requested_features.use_hair = true; + kernel_features |= KERNEL_FEATURE_HAIR; } } - requested_features.use_background_light = light_manager->has_background_light(this); - - requested_features.use_baking = bake_manager->get_baking(); - requested_features.use_integrator_branched = (integrator->get_method() == - Integrator::BRANCHED_PATH); - if (film->get_denoising_data_pass()) { - requested_features.use_denoising = true; - requested_features.use_shadow_tricks = true; + if (bake_manager->get_baking()) { + kernel_features |= KERNEL_FEATURE_BAKING; } - return requested_features; -} + kernel_features |= film->get_kernel_features(this); -bool Scene::update(Progress &progress, bool &kernel_switch_needed) -{ - /* update scene */ - if (need_update()) { - /* Update max_closures. */ - KernelIntegrator *kintegrator = &dscene.data.integrator; - if (params.background) { - kintegrator->max_closures = get_max_closure_count(); - } - else { - /* Currently viewport render is faster with higher max_closures, needs investigating. */ - kintegrator->max_closures = MAX_CLOSURE; - } - - /* Load render kernels, before device update where we upload data to the GPU. 
*/ - bool new_kernels_needed = load_kernels(progress, false); - - progress.set_status("Updating Scene"); - MEM_GUARDED_CALL(&progress, device_update, device, progress); + dscene.data.kernel_features = kernel_features; - DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state(); - kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE || - kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID; - if (new_kernels_needed || kernel_switch_needed) { - progress.set_kernel_status("Compiling render kernels"); - device->wait_for_availability(loaded_kernel_features); - progress.set_kernel_status(""); - } + /* Currently viewport render is faster with higher max_closures, needs investigating. */ + const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE; + dscene.data.max_closures = max_closures; + dscene.data.max_shaders = shaders.size(); +} - return true; +bool Scene::update(Progress &progress) +{ + if (!need_update()) { + return false; } - return false; + + /* Load render kernels, before device update where we upload data to the GPU. */ + load_kernels(progress, false); + + /* Upload scene data to the GPU. 
*/ + progress.set_status("Updating Scene"); + MEM_GUARDED_CALL(&progress, device_update, device, progress); + + return true; +} + +static void log_kernel_features(const uint features) +{ + VLOG(2) << "Requested features:\n"; + VLOG(2) << "Use BSDF " << string_from_bool(features & KERNEL_FEATURE_NODE_BSDF) << "\n"; + VLOG(2) << "Use Principled BSDF " << string_from_bool(features & KERNEL_FEATURE_PRINCIPLED) + << "\n"; + VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n"; + VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n"; + VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n"; + VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n"; + VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA) + << "\n"; + VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE) + << "\n"; + VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n"; + VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n"; + VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING) + << "\n"; + VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_HAIR) << "\n"; + VLOG(2) << "Use Object Motion " << string_from_bool(features & KERNEL_FEATURE_OBJECT_MOTION) + << "\n"; + VLOG(2) << "Use Camera Motion " << string_from_bool(features & KERNEL_FEATURE_CAMERA_MOTION) + << "\n"; + VLOG(2) << "Use Baking " << string_from_bool(features & KERNEL_FEATURE_BAKING) << "\n"; + VLOG(2) << "Use Subsurface " << string_from_bool(features & KERNEL_FEATURE_SUBSURFACE) << "\n"; + VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_VOLUME) << "\n"; + VLOG(2) << "Use Patch Evaluation " + << string_from_bool(features & KERNEL_FEATURE_PATCH_EVALUATION) << "\n"; + 
VLOG(2) << "Use Shadow Catcher " << string_from_bool(features & KERNEL_FEATURE_SHADOW_CATCHER) + << "\n"; } bool Scene::load_kernels(Progress &progress, bool lock_scene) @@ -560,15 +584,15 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene) scene_lock = thread_scoped_lock(mutex); } - DeviceRequestedFeatures requested_features = get_requested_device_features(); + const uint kernel_features = dscene.data.kernel_features; - if (!kernels_loaded || loaded_kernel_features.modified(requested_features)) { + if (!kernels_loaded || loaded_kernel_features != kernel_features) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); scoped_timer timer; - VLOG(2) << "Requested features:\n" << requested_features; - if (!device->load_kernels(requested_features)) { + log_kernel_features(kernel_features); + if (!device->load_kernels(kernel_features)) { string message = device->error_message(); if (message.empty()) message = "Failed loading render kernel, see console for errors"; @@ -580,7 +604,7 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene) } kernels_loaded = true; - loaded_kernel_features = requested_features; + loaded_kernel_features = kernel_features; return true; } return false; @@ -618,6 +642,28 @@ int Scene::get_max_closure_count() return max_closure_global; } +bool Scene::has_shadow_catcher() +{ + if (shadow_catcher_modified_) { + has_shadow_catcher_ = false; + for (Object *object : objects) { + if (object->get_is_shadow_catcher()) { + has_shadow_catcher_ = true; + break; + } + } + + shadow_catcher_modified_ = false; + } + + return has_shadow_catcher_; +} + +void Scene::tag_shadow_catcher_modified() +{ + shadow_catcher_modified_ = true; +} + template<> Light *Scene::create_node<Light>() { Light *node = new Light(); @@ -694,6 +740,15 @@ template<> AlembicProcedural *Scene::create_node<AlembicProcedural>() #endif } +template<> Pass *Scene::create_node<Pass>() +{ + Pass *node = new Pass(); + node->set_owner(this); + 
passes.push_back(node); + film->tag_modified(); + return node; +} + template<typename T> void delete_node_from_array(vector<T> &nodes, T node) { for (size_t i = 0; i < nodes.size(); ++i) { @@ -779,6 +834,12 @@ template<> void Scene::delete_node_impl(AlembicProcedural *node) #endif } +template<> void Scene::delete_node_impl(Pass *node) +{ + delete_node_from_array(passes, node); + film->tag_modified(); +} + template<typename T> static void remove_nodes_in_set(const set<T *> &nodes_set, vector<T *> &nodes_array, @@ -842,4 +903,10 @@ template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOw procedural_manager->tag_update(); } +template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner) +{ + remove_nodes_in_set(nodes, passes, owner); + film->tag_modified(); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 7d8a6774381..cf4a3ba6b12 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -128,7 +128,7 @@ class DeviceScene { device_vector<float> lookup_table; /* integrator */ - device_vector<uint> sample_pattern_lut; + device_vector<float> sample_pattern_lut; /* ies lights */ device_vector<float> ies_lights; @@ -142,27 +142,6 @@ class DeviceScene { class SceneParams { public: - /* Type of BVH, in terms whether it is supported dynamic updates of meshes - * or whether modifying geometry requires full BVH rebuild. - */ - enum BVHType { - /* BVH supports dynamic updates of geometry. - * - * Faster for updating BVH tree when doing modifications in viewport, - * but slower for rendering. - */ - BVH_DYNAMIC = 0, - /* BVH tree is calculated for specific scene, updates in geometry - * requires full tree rebuild. - * - * Slower to update BVH tree when modifying objects in viewport, also - * slower to build final BVH tree but gives best possible render speed. - */ - BVH_STATIC = 1, - - BVH_NUM_TYPES, - }; - ShadingSystem shadingsystem; /* Requested BVH layout. 
@@ -186,7 +165,7 @@ class SceneParams { { shadingsystem = SHADINGSYSTEM_SVM; bvh_layout = BVH_LAYOUT_BVH2; - bvh_type = BVH_DYNAMIC; + bvh_type = BVH_TYPE_DYNAMIC; use_bvh_spatial_split = false; use_bvh_unaligned_nodes = true; num_bvh_time_steps = 0; @@ -196,7 +175,7 @@ class SceneParams { background = true; } - bool modified(const SceneParams ¶ms) + bool modified(const SceneParams ¶ms) const { return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout && bvh_type == params.bvh_type && @@ -236,7 +215,7 @@ class Scene : public NodeOwner { vector<Shader *> shaders; vector<Light *> lights; vector<ParticleSystem *> particle_systems; - vector<Pass> passes; + vector<Pass *> passes; vector<Procedural *> procedurals; /* data managers */ @@ -291,7 +270,11 @@ class Scene : public NodeOwner { void enable_update_stats(); - bool update(Progress &progress, bool &kernel_switch_needed); + void update_kernel_features(); + bool update(Progress &progress); + + bool has_shadow_catcher(); + void tag_shadow_catcher_modified(); /* This function is used to create a node of a specified type instead of * calling 'new', and sets the scene as the owner of the node. @@ -348,13 +331,12 @@ class Scene : public NodeOwner { void free_memory(bool final); bool kernels_loaded; - DeviceRequestedFeatures loaded_kernel_features; + uint loaded_kernel_features; bool load_kernels(Progress &progress, bool lock_scene = true); - /* ** Split kernel routines ** */ - - DeviceRequestedFeatures get_requested_device_features(); + bool has_shadow_catcher_ = false; + bool shadow_catcher_modified_ = true; /* Maximum number of closure during session lifetime. 
*/ int max_closure_global; @@ -384,6 +366,8 @@ template<> Shader *Scene::create_node<Shader>(); template<> AlembicProcedural *Scene::create_node<AlembicProcedural>(); +template<> Pass *Scene::create_node<Pass>(); + template<> void Scene::delete_node_impl(Light *node); template<> void Scene::delete_node_impl(Mesh *node); @@ -404,6 +388,8 @@ template<> void Scene::delete_node_impl(Procedural *node); template<> void Scene::delete_node_impl(AlembicProcedural *node); +template<> void Scene::delete_node_impl(Pass *node); + template<> void Scene::delete_nodes(const set<Light *> &nodes, const NodeOwner *owner); template<> void Scene::delete_nodes(const set<Geometry *> &nodes, const NodeOwner *owner); @@ -416,6 +402,8 @@ template<> void Scene::delete_nodes(const set<Shader *> &nodes, const NodeOwner template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOwner *owner); +template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner); + CCL_NAMESPACE_END #endif /* __SCENE_H__ */ diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 1b91c49f0ea..84407f8e6dd 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -17,10 +17,15 @@ #include <limits.h> #include <string.h> +#include "device/cpu/device.h" #include "device/device.h" +#include "integrator/pass_accessor_cpu.h" +#include "integrator/path_trace.h" +#include "render/background.h" #include "render/bake.h" #include "render/buffers.h" #include "render/camera.h" +#include "render/gpu_display.h" #include "render/graph.h" #include "render/integrator.h" #include "render/light.h" @@ -39,70 +44,63 @@ CCL_NAMESPACE_BEGIN -/* Note about preserve_tile_device option for tile manager: - * progressive refine and viewport rendering does requires tiles to - * always be allocated for the same device - */ -Session::Session(const SessionParams ¶ms_) - : params(params_), - tile_manager(params.progressive, - params.samples, - 
params.tile_size, - params.start_resolution, - params.background == false || params.progressive_refine, - params.background, - params.tile_order, - max(params.device.multi_devices.size(), 1), - params.pixel_size), - stats(), - profiler() +Session::Session(const SessionParams ¶ms_, const SceneParams &scene_params) + : params(params_), render_scheduler_(tile_manager_, params) { - device_use_gl_ = ((params.device.type != DEVICE_CPU) && !params.background); - TaskScheduler::init(params.threads); - session_thread_ = NULL; - scene = NULL; - - reset_time_ = 0.0; - last_update_time_ = 0.0; + session_thread_ = nullptr; delayed_reset_.do_reset = false; - delayed_reset_.samples = 0; - - display_outdated_ = false; - gpu_draw_ready_ = false; - gpu_need_display_buffer_update_ = false; pause_ = false; cancel_ = false; new_work_added_ = false; - buffers = NULL; - display = NULL; + device = Device::create(params.device, stats, profiler); - /* Validate denoising parameters. */ - set_denoising(params.denoising); + scene = new Scene(scene_params, device); - /* Create CPU/GPU devices. */ - device = Device::create(params.device, stats, profiler, params.background); - - if (!device->error_message().empty()) { - progress.set_error(device->error_message()); - return; - } + /* Configure path tracer. */ + path_trace_ = make_unique<PathTrace>( + device, scene->film, &scene->dscene, render_scheduler_, tile_manager_); + path_trace_->set_progress(&progress); + path_trace_->tile_buffer_update_cb = [&]() { + if (!update_render_tile_cb) { + return; + } + update_render_tile_cb(); + }; + path_trace_->tile_buffer_write_cb = [&]() { + if (!write_render_tile_cb) { + return; + } + write_render_tile_cb(); + }; + path_trace_->tile_buffer_read_cb = [&]() -> bool { + if (!read_render_tile_cb) { + return false; + } + read_render_tile_cb(); + return true; + }; + path_trace_->progress_update_cb = [&]() { update_status_time(); }; - /* Create buffers for interactive rendering. 
*/ - if (!(params.background && !params.write_render_cb)) { - buffers = new RenderBuffers(device); - display = new DisplayBuffer(device, params.display_buffer_linear); - } + tile_manager_.full_buffer_written_cb = [&](string_view filename) { + if (!full_buffer_written_cb) { + return; + } + full_buffer_written_cb(filename); + }; } Session::~Session() { cancel(); + /* TODO(sergey): Bring the passes in viewport back. + * It is unclear why there is such an exception needed though. */ +#if 0 if (buffers && params.write_render_cb) { /* Copy to display buffer and write out image if requested */ delete display; @@ -116,12 +114,14 @@ Session::~Session() uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h); params.write_render_cb((uchar *)pixels, w, h, 4); } +#endif - /* clean up */ - tile_manager.device_free(); + /* Make sure path tracer is destroyed before the deviec. This is needed because destruction might + * need to access device for device memory free. */ + /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the + * pre-defined order. 
*/ + path_trace_.reset(); - delete buffers; - delete display; delete scene; delete device; @@ -135,15 +135,16 @@ void Session::start() } } -void Session::cancel() +void Session::cancel(bool quick) { + if (quick && path_trace_) { + path_trace_->cancel(); + } + if (session_thread_) { /* wait for session thread to end */ progress.set_cancel("Exiting"); - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); - { thread_scoped_lock pause_lock(pause_mutex_); pause_ = false; @@ -157,570 +158,43 @@ void Session::cancel() bool Session::ready_to_reset() { - double dt = time_dt() - reset_time_; - - if (!display_outdated_) - return (dt > params.reset_timeout); - else - return (dt > params.cancel_timeout); + return path_trace_->ready_to_reset(); } -/* GPU Session */ - -void Session::reset_gpu(BufferParams &buffer_params, int samples) +void Session::run_main_render_loop() { - thread_scoped_lock pause_lock(pause_mutex_); - - /* block for buffer access and reset immediately. 
we can't do this - * in the thread, because we need to allocate an OpenGL buffer, and - * that only works in the main thread */ - thread_scoped_lock display_lock(display_mutex_); - thread_scoped_lock buffers_lock(buffers_mutex_); + path_trace_->clear_gpu_display(); - display_outdated_ = true; - reset_time_ = time_dt(); + while (true) { + RenderWork render_work = run_update_for_next_iteration(); - reset_(buffer_params, samples); - - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); - - new_work_added_ = true; - - pause_cond_.notify_all(); -} - -bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_params) -{ - /* block for buffer access */ - thread_scoped_lock display_lock(display_mutex_); - - /* first check we already rendered something */ - if (gpu_draw_ready_) { - /* then verify the buffers have the expected size, so we don't - * draw previous results in a resized window */ - if (buffer_params.width == display->params.width && - buffer_params.height == display->params.height) { - /* for CUDA we need to do tone-mapping still, since we can - * only access GL buffers from the main thread. 
*/ - if (gpu_need_display_buffer_update_) { - thread_scoped_lock buffers_lock(buffers_mutex_); - copy_to_display_buffer(tile_manager.state.sample); - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); + if (!render_work) { + if (VLOG_IS_ON(2)) { + double total_time, render_time; + progress.get_time(total_time, render_time); + VLOG(2) << "Rendering in main loop is done in " << render_time << " seconds."; + VLOG(2) << path_trace_->full_report(); } - display->draw(device, draw_params); - - if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout) - return false; - - return true; - } - } - - return false; -} - -void Session::run_gpu() -{ - bool tiles_written = false; - - reset_time_ = time_dt(); - last_update_time_ = time_dt(); - last_display_time_ = last_update_time_; - - progress.set_render_start_time(); - - while (!progress.get_cancel()) { - const bool no_tiles = !run_update_for_next_iteration(); - - if (no_tiles) { if (params.background) { - /* if no work left and in background mode, we can stop immediately */ + /* if no work left and in background mode, we can stop immediately. 
*/ progress.set_status("Finished"); break; } } - if (run_wait_for_work(no_tiles)) { - continue; - } - - if (progress.get_cancel()) { - break; - } - - if (!no_tiles) { - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - if (progress.get_cancel()) - break; - - /* buffers mutex is locked entirely while rendering each - * sample, and released/reacquired on each iteration to allow - * reset and draw in between */ - thread_scoped_lock buffers_lock(buffers_mutex_); - - /* update status and timing */ - update_status_time(); - - /* render */ - bool delayed_denoise = false; - const bool need_denoise = render_need_denoise(delayed_denoise); - render(need_denoise); - - device->task_wait(); - - if (!device->error_message().empty()) - progress.set_cancel(device->error_message()); - - /* update status and timing */ - update_status_time(); - - gpu_need_display_buffer_update_ = !delayed_denoise; - gpu_draw_ready_ = true; - progress.set_update(); - - /* wait for until display buffer is updated */ - if (!params.background) { - while (gpu_need_display_buffer_update_) { - if (progress.get_cancel()) - break; - - gpu_need_display_buffer_update_cond_.wait(buffers_lock); - } - } - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - tiles_written = update_progressive_refine(progress.get_cancel()); - - if (progress.get_cancel()) - break; - } - } - - if (!tiles_written) - update_progressive_refine(true); -} - -/* CPU Session */ - -void Session::reset_cpu(BufferParams &buffer_params, int samples) -{ - thread_scoped_lock reset_lock(delayed_reset_.mutex); - thread_scoped_lock pause_lock(pause_mutex_); - - display_outdated_ = true; - reset_time_ = time_dt(); - - delayed_reset_.params = buffer_params; - delayed_reset_.samples = samples; - delayed_reset_.do_reset = true; - device->task_cancel(); - - pause_cond_.notify_all(); -} - -bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_params) -{ - 
thread_scoped_lock display_lock(display_mutex_); - - /* first check we already rendered something */ - if (display->draw_ready()) { - /* then verify the buffers have the expected size, so we don't - * draw previous results in a resized window */ - if (buffer_params.width == display->params.width && - buffer_params.height == display->params.height) { - display->draw(device, draw_params); - - if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout) - return false; - - return true; - } - } - - return false; -} - -bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock) -{ - /* Devices that can get their tiles stolen don't steal tiles themselves. - * Additionally, if there are no stealable tiles in flight, give up here. */ - if (tile_device->info.type == DEVICE_CPU || stealable_tiles_ == 0) { - return false; - } - - /* Wait until no other thread is trying to steal a tile. */ - while (tile_stealing_state_ != NOT_STEALING && stealable_tiles_ > 0) { - /* Someone else is currently trying to get a tile. - * Wait on the condition variable and try later. */ - tile_steal_cond_.wait(tile_lock); - } - /* If another thread stole the last stealable tile in the meantime, give up. */ - if (stealable_tiles_ == 0) { - return false; - } - - /* There are stealable tiles in flight, so signal that one should be released. */ - tile_stealing_state_ = WAITING_FOR_TILE; - - /* Wait until a device notices the signal and releases its tile. */ - while (tile_stealing_state_ != GOT_TILE && stealable_tiles_ > 0) { - tile_steal_cond_.wait(tile_lock); - } - /* If the last stealable tile finished on its own, give up. */ - if (tile_stealing_state_ != GOT_TILE) { - tile_stealing_state_ = NOT_STEALING; - return false; - } - - /* Successfully stole a tile, now move it to the new device. 
*/ - rtile = stolen_tile_; - rtile.buffers->buffer.move_device(tile_device); - rtile.buffer = rtile.buffers->buffer.device_pointer; - rtile.stealing_state = RenderTile::NO_STEALING; - rtile.num_samples -= (rtile.sample - rtile.start_sample); - rtile.start_sample = rtile.sample; - - tile_stealing_state_ = NOT_STEALING; - - /* Poke any threads which might be waiting for NOT_STEALING above. */ - tile_steal_cond_.notify_one(); - - return true; -} - -bool Session::get_tile_stolen() -{ - /* If tile_stealing_state is WAITING_FOR_TILE, atomically set it to RELEASING_TILE - * and return true. */ - TileStealingState expected = WAITING_FOR_TILE; - return tile_stealing_state_.compare_exchange_weak(expected, RELEASING_TILE); -} - -bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types) -{ - if (progress.get_cancel()) { - if (params.progressive_refine == false) { - /* for progressive refine current sample should be finished for all tiles */ - return false; - } - } - - thread_scoped_lock tile_lock(tile_mutex_); - - /* get next tile from manager */ - Tile *tile; - int device_num = device->device_number(tile_device); - - while (!tile_manager.next_tile(tile, device_num, tile_types)) { - /* Can only steal tiles on devices that support rendering - * This is because denoising tiles cannot be stolen (see below) - */ - if ((tile_types & (RenderTile::PATH_TRACE | RenderTile::BAKE)) && - steal_tile(rtile, tile_device, tile_lock)) { - return true; - } - - /* Wait for denoising tiles to become available */ - if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) { - denoising_cond_.wait(tile_lock); - continue; - } - - return false; - } - - /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile->x; - rtile.y = tile_manager.state.buffer.full_y + tile->y; - rtile.w = tile->w; - rtile.h = tile->h; - rtile.start_sample = tile_manager.state.sample; - rtile.num_samples = tile_manager.state.num_samples; - 
rtile.resolution = tile_manager.state.resolution_divider; - rtile.tile_index = tile->index; - rtile.stealing_state = RenderTile::NO_STEALING; - - if (tile->state == Tile::DENOISE) { - rtile.task = RenderTile::DENOISE; - } - else { - if (tile_device->info.type == DEVICE_CPU) { - stealable_tiles_++; - rtile.stealing_state = RenderTile::CAN_BE_STOLEN; - } - - if (read_bake_tile_cb) { - rtile.task = RenderTile::BAKE; - } - else { - rtile.task = RenderTile::PATH_TRACE; - } - } - - tile_lock.unlock(); - - /* in case of a permanent buffer, return it, otherwise we will allocate - * a new temporary buffer */ - if (buffers) { - tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = buffers->buffer.device_pointer; - rtile.buffers = buffers; - - device->map_tile(tile_device, rtile); - - /* Reset copy state, since buffer contents change after the tile was acquired */ - buffers->map_neighbor_copied = false; - - /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts - * for the buffer resolution divider. */ - buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) / - tile_manager.state.resolution_divider; - buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider; - - return true; - } - - if (tile->buffers == NULL) { - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - /* allocate buffers */ - tile->buffers = new RenderBuffers(tile_device); - tile->buffers->reset(buffer_params); - } - else if (tile->buffers->buffer.device != tile_device) { - /* Move buffer to current tile device again in case it was stolen before. - * Not needed for denoising since that already handles mapping of tiles and - * neighbors to its own device. 
*/ - if (rtile.task != RenderTile::DENOISE) { - tile->buffers->buffer.move_device(tile_device); - } - } - - tile->buffers->map_neighbor_copied = false; - - tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = tile->buffers->buffer.device_pointer; - rtile.buffers = tile->buffers; - rtile.sample = tile_manager.state.sample; - - if (read_bake_tile_cb) { - /* This will read any passes needed as input for baking. */ - if (tile_manager.state.sample == tile_manager.range_start_sample) { - { - thread_scoped_lock tile_lock(tile_mutex_); - read_bake_tile_cb(rtile); - } - rtile.buffers->buffer.copy_to_device(); - } - } - else { - /* This will tag tile as IN PROGRESS in blender-side render pipeline, - * which is needed to highlight currently rendering tile before first - * sample was processed for it. */ - update_tile_sample(rtile); - } - - return true; -} - -void Session::update_tile_sample(RenderTile &rtile) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - if (update_render_tile_cb) { - if (params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ - - update_render_tile_cb(rtile, true); - } - } - - update_status_time(); -} - -void Session::release_tile(RenderTile &rtile, const bool need_denoise) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - if (rtile.stealing_state != RenderTile::NO_STEALING) { - stealable_tiles_--; - if (rtile.stealing_state == RenderTile::WAS_STOLEN) { - /* If the tile is being stolen, don't release it here - the new device will pick up where - * the old one left off. */ - - assert(tile_stealing_state_ == RELEASING_TILE); - assert(rtile.sample < rtile.start_sample + rtile.num_samples); - - tile_stealing_state_ = GOT_TILE; - stolen_tile_ = rtile; - tile_steal_cond_.notify_all(); - return; - } - else if (stealable_tiles_ == 0) { - /* If this was the last stealable tile, wake up any threads still waiting for one. 
*/ - tile_steal_cond_.notify_all(); - } - } - - progress.add_finished_tile(rtile.task == RenderTile::DENOISE); - - bool delete_tile; - - if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) { - /* Finished tile pixels write. */ - if (write_render_tile_cb && params.progressive_refine == false) { - write_render_tile_cb(rtile); - } - - if (delete_tile) { - delete rtile.buffers; - tile_manager.state.tiles[rtile.tile_index].buffers = NULL; - } - } - else { - /* In progress tile pixels update. */ - if (update_render_tile_cb && params.progressive_refine == false) { - update_render_tile_cb(rtile, false); - } - } - - update_status_time(); - - /* Notify denoising thread that a tile was finished. */ - denoising_cond_.notify_all(); -} - -void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - const int4 image_region = make_int4( - tile_manager.state.buffer.full_x, - tile_manager.state.buffer.full_y, - tile_manager.state.buffer.full_x + tile_manager.state.buffer.width, - tile_manager.state.buffer.full_y + tile_manager.state.buffer.height); - - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - - if (!tile_manager.schedule_denoising) { - /* Fix up tile slices with overlap. */ - if (tile_manager.slice_overlap != 0) { - int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y); - center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap, - image_region.w) - - y; - center_tile.y = y; - } - - /* Tiles are not being denoised individually, which means the entire image is processed. 
*/ - neighbors.set_bounds_from_center(); - } - else { - int center_idx = center_tile.tile_index; - assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); - - for (int dy = -1, i = 0; dy <= 1; dy++) { - for (int dx = -1; dx <= 1; dx++, i++) { - RenderTile &rtile = neighbors.tiles[i]; - int nindex = tile_manager.get_neighbor_index(center_idx, i); - if (nindex >= 0) { - Tile *tile = &tile_manager.state.tiles[nindex]; - - rtile.x = image_region.x + tile->x; - rtile.y = image_region.y + tile->y; - rtile.w = tile->w; - rtile.h = tile->h; - - if (buffers) { - tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = buffers->buffer.device_pointer; - rtile.buffers = buffers; - } - else { - assert(tile->buffers); - tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = tile->buffers->buffer.device_pointer; - rtile.buffers = tile->buffers; - } - } - else { - int px = center_tile.x + dx * params.tile_size.x; - int py = center_tile.y + dy * params.tile_size.y; - - rtile.x = clamp(px, image_region.x, image_region.z); - rtile.y = clamp(py, image_region.y, image_region.w); - rtile.w = rtile.h = 0; - - rtile.buffer = (device_ptr)NULL; - rtile.buffers = NULL; - } - } - } - } - - assert(center_tile.buffers); - device->map_neighbor_tiles(tile_device, neighbors); - - /* The denoised result is written back to the original tile. 
*/ - neighbors.target = center_tile; -} - -void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device) -{ - thread_scoped_lock tile_lock(tile_mutex_); - device->unmap_neighbor_tiles(tile_device, neighbors); -} - -void Session::run_cpu() -{ - bool tiles_written = false; - - last_update_time_ = time_dt(); - last_display_time_ = last_update_time_; - - while (!progress.get_cancel()) { - const bool no_tiles = !run_update_for_next_iteration(); - bool need_copy_to_display_buffer = false; - - if (no_tiles) { - if (params.background) { - /* if no work left and in background mode, we can stop immediately */ - progress.set_status("Finished"); + const bool did_cancel = progress.get_cancel(); + if (did_cancel) { + render_scheduler_.render_work_reschedule_on_cancel(render_work); + if (!render_work) { break; } } - - if (run_wait_for_work(no_tiles)) { + else if (run_wait_for_work(render_work)) { continue; } - if (progress.get_cancel()) { - break; - } - - if (!no_tiles) { - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - if (progress.get_cancel()) - break; - + { /* buffers mutex is locked entirely while rendering each * sample, and released/reacquired on each iteration to allow * reset and draw in between */ @@ -730,49 +204,25 @@ void Session::run_cpu() update_status_time(); /* render */ - bool delayed_denoise = false; - const bool need_denoise = render_need_denoise(delayed_denoise); - render(need_denoise); + path_trace_->render(render_work); /* update status and timing */ update_status_time(); - if (!params.background) - need_copy_to_display_buffer = !delayed_denoise; - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - } - - device->task_wait(); - - { - thread_scoped_lock reset_lock(delayed_reset_.mutex); - thread_scoped_lock buffers_lock(buffers_mutex_); - thread_scoped_lock display_lock(display_mutex_); - - if (delayed_reset_.do_reset) { - /* reset rendering if request 
from main thread */ - delayed_reset_.do_reset = false; - reset_(delayed_reset_.params, delayed_reset_.samples); - } - else if (need_copy_to_display_buffer) { - /* Only copy to display_buffer if we do not reset, we don't - * want to show the result of an incomplete sample */ - copy_to_display_buffer(tile_manager.state.sample); + if (device->have_error()) { + const string &error_message = device->error_message(); + progress.set_error(error_message); + progress.set_cancel(error_message); + break; } - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - tiles_written = update_progressive_refine(progress.get_cancel()); } progress.set_update(); - } - if (!tiles_written) - update_progressive_refine(true); + if (did_cancel) { + break; + } + } } void Session::run() @@ -789,10 +239,7 @@ void Session::run() /* reset number of rendered samples */ progress.reset_sample(); - if (device_use_gl_) - run_gpu(); - else - run_cpu(); + run_main_render_loop(); } profiler.stop(); @@ -804,31 +251,92 @@ void Session::run() progress.set_update(); } -bool Session::run_update_for_next_iteration() +RenderWork Session::run_update_for_next_iteration() { + RenderWork render_work; + thread_scoped_lock scene_lock(scene->mutex); thread_scoped_lock reset_lock(delayed_reset_.mutex); + bool have_tiles = true; + bool switched_to_new_tile = false; + if (delayed_reset_.do_reset) { thread_scoped_lock buffers_lock(buffers_mutex_); - reset_(delayed_reset_.params, delayed_reset_.samples); - delayed_reset_.do_reset = false; + do_delayed_reset(); + + /* After reset make sure the tile manager is at the first big tile. */ + have_tiles = tile_manager_.next(); + switched_to_new_tile = true; + } + + /* Update number of samples in the integrator. + * Ideally this would need to happen once in `Session::set_samples()`, but the issue there is + * the initial configuration when Session is created where the `set_samples()` is not used. 
*/ + scene->integrator->set_aa_samples(params.samples); + + /* Update denoiser settings. */ + { + const DenoiseParams denoise_params = scene->integrator->get_denoise_params(); + path_trace_->set_denoiser_params(denoise_params); + } + + /* Update adaptive sampling. */ + { + const AdaptiveSampling adaptive_sampling = scene->integrator->get_adaptive_sampling(); + path_trace_->set_adaptive_sampling(adaptive_sampling); } - const bool have_tiles = tile_manager.next(); + render_scheduler_.set_num_samples(params.samples); + render_scheduler_.set_time_limit(params.time_limit); + + while (have_tiles) { + render_work = render_scheduler_.get_render_work(); + if (render_work) { + break; + } - if (have_tiles) { + progress.add_finished_tile(false); + + have_tiles = tile_manager_.next(); + if (have_tiles) { + render_scheduler_.reset_for_next_tile(); + switched_to_new_tile = true; + } + } + + if (render_work) { scoped_timer update_timer; - if (update_scene()) { + + if (switched_to_new_tile) { + BufferParams tile_params = buffer_params_; + + const Tile &tile = tile_manager_.get_current_tile(); + tile_params.width = tile.width; + tile_params.height = tile.height; + tile_params.full_x = tile.x + buffer_params_.full_x; + tile_params.full_y = tile.y + buffer_params_.full_y; + tile_params.full_width = buffer_params_.full_width; + tile_params.full_height = buffer_params_.full_height; + tile_params.update_offset_stride(); + + path_trace_->reset(buffer_params_, tile_params); + } + + const int resolution = render_work.resolution_divider; + const int width = max(1, buffer_params_.full_width / resolution); + const int height = max(1, buffer_params_.full_height / resolution); + + if (update_scene(width, height)) { profiler.reset(scene->shaders.size(), scene->objects.size()); } progress.add_skip_time(update_timer, params.background); } - return have_tiles; + return render_work; } -bool Session::run_wait_for_work(bool no_tiles) +bool Session::run_wait_for_work(const RenderWork &render_work) { /* 
In an offline rendering there is no pause, and no tiles will mean the job is fully done. */ if (params.background) { @@ -837,19 +345,20 @@ bool Session::run_wait_for_work(bool no_tiles) thread_scoped_lock pause_lock(pause_mutex_); - if (!pause_ && !no_tiles) { + if (!pause_ && render_work) { /* Rendering is not paused and there is work to be done. No need to wait for anything. */ return false; } - update_status_time(pause_, no_tiles); + const bool no_work = !render_work; + update_status_time(pause_, no_work); /* Only leave the loop when rendering is not paused. But even if the current render is un-paused * but there is nothing to render keep waiting until new work is added. */ while (!cancel_) { scoped_timer pause_timer; - if (!pause_ && (!no_tiles || new_work_added_ || delayed_reset_.do_reset)) { + if (!pause_ && (render_work || new_work_added_ || delayed_reset_.do_reset)) { break; } @@ -860,52 +369,88 @@ bool Session::run_wait_for_work(bool no_tiles) progress.add_skip_time(pause_timer, params.background); } - update_status_time(pause_, no_tiles); + update_status_time(pause_, no_work); progress.set_update(); } new_work_added_ = false; - return no_tiles; + return no_work; } -bool Session::draw(BufferParams &buffer_params, DeviceDrawParams &draw_params) +void Session::draw() { - if (device_use_gl_) - return draw_gpu(buffer_params, draw_params); - else - return draw_cpu(buffer_params, draw_params); + path_trace_->draw(); } -void Session::reset_(BufferParams &buffer_params, int samples) +int2 Session::get_effective_tile_size() const { - if (buffers && buffer_params.modified(tile_manager.params)) { - gpu_draw_ready_ = false; - buffers->reset(buffer_params); - if (display) { - display->reset(buffer_params); - } + /* No support yet for baking with tiles. 
*/ + if (!params.use_auto_tile || scene->bake_manager->get_baking()) { + return make_int2(buffer_params_.width, buffer_params_.height); } - tile_manager.reset(buffer_params, samples); - stealable_tiles_ = 0; - tile_stealing_state_ = NOT_STEALING; - progress.reset_sample(); + /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile + * and prefer optimal performance. */ + + return make_int2(params.tile_size, params.tile_size); +} + +void Session::do_delayed_reset() +{ + if (!delayed_reset_.do_reset) { + return; + } + delayed_reset_.do_reset = false; + + params = delayed_reset_.session_params; + buffer_params_ = delayed_reset_.buffer_params; + + /* Store parameters used for buffers access outside of scene graph. */ + buffer_params_.exposure = scene->film->get_exposure(); + buffer_params_.use_approximate_shadow_catcher = + scene->film->get_use_approximate_shadow_catcher(); + buffer_params_.use_transparent_background = scene->background->get_transparent(); - bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX; - progress.set_total_pixel_samples(show_progress ? tile_manager.state.total_pixel_samples : 0); + /* Tile and work scheduling. */ + tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size()); + render_scheduler_.reset(buffer_params_, params.samples); - if (!params.background) + /* Passes. */ + /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial + * tile results. It is safe to use generic update function here which checks for changes since + * changes in tile settings re-creates session, which ensures film is fully updated on tile + * changes. */ + scene->film->update_passes(scene, tile_manager_.has_multiple_tiles()); + + /* Update for new state of scene and passes. */ + buffer_params_.update_passes(scene->passes); + tile_manager_.update(buffer_params_, scene); + + /* Progress. 
*/ + progress.reset_sample(); + progress.set_total_pixel_samples(buffer_params_.width * buffer_params_.height * params.samples); + + if (!params.background) { progress.set_start_time(); + } progress.set_render_start_time(); } -void Session::reset(BufferParams &buffer_params, int samples) +void Session::reset(const SessionParams &session_params, const BufferParams &buffer_params) { - if (device_use_gl_) - reset_gpu(buffer_params, samples); - else - reset_cpu(buffer_params, samples); + { + thread_scoped_lock reset_lock(delayed_reset_.mutex); + thread_scoped_lock pause_lock(pause_mutex_); + + delayed_reset_.do_reset = true; + delayed_reset_.session_params = session_params; + delayed_reset_.buffer_params = buffer_params; + + path_trace_->cancel(); + } + + pause_cond_.notify_all(); } void Session::set_samples(int samples) @@ -915,7 +460,22 @@ void Session::set_samples(int samples) } params.samples = samples; - tile_manager.set_samples(samples); + + { + thread_scoped_lock pause_lock(pause_mutex_); + new_work_added_ = true; + } + + pause_cond_.notify_all(); +} + +void Session::set_time_limit(double time_limit) +{ + if (time_limit == params.time_limit) { + return; + } + + params.time_limit = time_limit; { thread_scoped_lock pause_lock(pause_mutex_); @@ -948,38 +508,9 @@ void Session::set_pause(bool pause) } } -void Session::set_denoising(const DenoiseParams &denoising) +void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) { - bool need_denoise = denoising.need_denoising_task(); - - /* Lock buffers so no denoising operation is triggered while the settings are changed here. */ - thread_scoped_lock buffers_lock(buffers_mutex_); - params.denoising = denoising; - - if (!(params.device.denoisers & denoising.type)) { - if (need_denoise) { - progress.set_error("Denoiser type not supported by compute device"); - } - - params.denoising.use = false; - need_denoise = false; - } - - // TODO(pmours): Query the required overlap value for denoising from the device? 
- tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0; - - /* Schedule per tile denoising for final renders if we are either denoising or - * need prefiltered passes for the native denoiser. */ - tile_manager.schedule_denoising = need_denoise && !buffers; -} - -void Session::set_denoising_start_sample(int sample) -{ - if (sample != params.denoising.start_sample) { - params.denoising.start_sample = sample; - - pause_cond_.notify_all(); - } + path_trace_->set_gpu_display(move(gpu_display)); } void Session::wait() @@ -989,81 +520,67 @@ void Session::wait() delete session_thread_; } - session_thread_ = NULL; + session_thread_ = nullptr; } -bool Session::update_scene() +bool Session::update_scene(int width, int height) { - /* update camera if dimensions changed for progressive render. the camera + /* Update camera if dimensions changed for progressive render. the camera * knows nothing about progressive or cropped rendering, it just gets the - * image dimensions passed in */ + * image dimensions passed in. */ Camera *cam = scene->camera; - int width = tile_manager.state.buffer.full_width; - int height = tile_manager.state.buffer.full_height; - int resolution = tile_manager.state.resolution_divider; - - cam->set_screen_size_and_resolution(width, height, resolution); + cam->set_screen_size(width, height); - /* number of samples is needed by multi jittered - * sampling pattern and by baking */ - Integrator *integrator = scene->integrator; - BakeManager *bake_manager = scene->bake_manager; + /* First detect which kernel features are used and allocate working memory. + * This helps estimate how may device memory is available for the scene and + * how much we need to allocate on the host instead. 
*/ + scene->update_kernel_features(); - if (integrator->get_sampling_pattern() != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) { - integrator->set_aa_samples(tile_manager.num_samples); - } + path_trace_->load_kernels(); + path_trace_->alloc_work_memory(); - bool kernel_switch_needed = false; - if (scene->update(progress, kernel_switch_needed)) { - if (kernel_switch_needed) { - reset(tile_manager.params, params.samples); - } + if (scene->update(progress)) { return true; } + return false; } +static string status_append(const string &status, const string &suffix) +{ + string prefix = status; + if (!prefix.empty()) { + prefix += ", "; + } + return prefix + suffix; +} + void Session::update_status_time(bool show_pause, bool show_done) { - int progressive_sample = tile_manager.state.sample; - int num_samples = tile_manager.get_num_effective_samples(); + string status, substatus; - int tile = progress.get_rendered_tiles(); - int num_tiles = tile_manager.state.num_tiles; + const int current_tile = progress.get_rendered_tiles(); + const int num_tiles = tile_manager_.get_num_tiles(); - /* update status */ - string status, substatus; + const int current_sample = progress.get_current_sample(); + const int num_samples = render_scheduler_.get_num_samples(); - if (!params.progressive) { - const bool is_cpu = params.device.type == DEVICE_CPU; - const bool rendering_finished = (tile == num_tiles); - const bool is_last_tile = (tile + 1) == num_tiles; - - substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles); - - if (!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { - /* Some devices automatically support showing the sample number: - * - CUDADevice - * - OpenCLDevice when using the megakernel (the split kernel renders multiple - * samples at the same time, so the current sample isn't really defined) - * - CPUDevice when using one thread - * For these devices, the current sample is always shown. 
- * - * The other option is when the last tile is currently being rendered by the CPU. - */ - substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); - } - if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) { - substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); - } - else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) { - substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles()); - } + /* TIle. */ + if (tile_manager_.has_multiple_tiles()) { + substatus = status_append(substatus, + string_printf("Rendered %d/%d Tiles", current_tile, num_tiles)); } - else if (tile_manager.num_samples == Integrator::MAX_SAMPLES) - substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1); - else - substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples); + + /* Sample. */ + if (num_samples == Integrator::MAX_SAMPLES) { + substatus = status_append(substatus, string_printf("Sample %d", current_sample)); + } + else { + substatus = status_append(substatus, + string_printf("Sample %d/%d", current_sample, num_samples)); + } + + /* TODO(sergey): Denoising status from the path trace. */ if (show_pause) { status = "Rendering Paused"; @@ -1080,210 +597,122 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -bool Session::render_need_denoise(bool &delayed) +void Session::device_free() { - delayed = false; - - /* Not supported yet for baking. */ - if (read_bake_tile_cb) { - return false; - } - - /* Denoising enabled? */ - if (!params.denoising.need_denoising_task()) { - return false; - } - - if (params.background) { - /* Background render, only denoise when rendering the last sample. */ - return tile_manager.done(); - } - - /* Viewport render. */ - - /* It can happen that denoising was already enabled, but the scene still needs an update. 
*/ - if (scene->film->is_modified() || !scene->film->get_denoising_data_offset()) { - return false; - } + scene->device_free(); + path_trace_->device_free(); +} - /* Immediately denoise when we reach the start sample or last sample. */ - const int num_samples_finished = tile_manager.state.sample + 1; - if (num_samples_finished == params.denoising.start_sample || - num_samples_finished == params.samples) { - return true; +void Session::collect_statistics(RenderStats *render_stats) +{ + scene->collect_statistics(render_stats); + if (params.use_profiling && (params.device.type == DEVICE_CPU)) { + render_stats->collect_profiling(scene, profiler); } +} - /* Do not denoise until the sample at which denoising should start is reached. */ - if (num_samples_finished < params.denoising.start_sample) { - return false; - } +/* -------------------------------------------------------------------- + * Tile and tile pixels aceess. + */ - /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */ - delayed = (tile_manager.state.sample >= 20 && - (time_dt() - last_display_time_) < params.progressive_update_timeout); - return !delayed; +bool Session::has_multiple_render_tiles() const +{ + return tile_manager_.has_multiple_tiles(); } -void Session::render(bool need_denoise) +int2 Session::get_render_tile_size() const { - if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) { - /* Clear buffers. */ - buffers->zero(); - } - - if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) { - return; /* Avoid empty launches. */ - } + return path_trace_->get_render_tile_size(); +} - /* Add path trace task. 
*/ - DeviceTask task(DeviceTask::RENDER); - - task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3); - task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise); - task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); - task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); - task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); - task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); - task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); - task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this); - task.need_finish_queue = params.progressive_refine; - task.integrator_branched = scene->integrator->get_method() == Integrator::BRANCHED_PATH; - - task.adaptive_sampling.use = (scene->integrator->get_sampling_pattern() == - SAMPLING_PATTERN_PMJ) && - scene->dscene.data.film.pass_adaptive_aux_buffer; - task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples; - task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step; - - /* Acquire render tiles by default. */ - task.tile_types = RenderTile::PATH_TRACE; - - if (need_denoise) { - task.denoising = params.denoising; - - task.pass_stride = scene->film->get_pass_stride(); - task.target_pass_stride = task.pass_stride; - task.pass_denoising_data = scene->film->get_denoising_data_offset(); - task.pass_denoising_clean = scene->film->get_denoising_clean_offset(); - - task.denoising_from_render = true; - - if (tile_manager.schedule_denoising) { - /* Acquire denoising tiles during rendering. */ - task.tile_types |= RenderTile::DENOISE; - } - else { - assert(buffers); - - /* Schedule rendering and wait for it to finish. */ - device->task_add(task); - device->task_wait(); - - /* Then run denoising on the whole image at once. 
*/ - task.type = DeviceTask::DENOISE_BUFFER; - task.x = tile_manager.state.buffer.full_x; - task.y = tile_manager.state.buffer.full_y; - task.w = tile_manager.state.buffer.width; - task.h = tile_manager.state.buffer.height; - task.buffer = buffers->buffer.device_pointer; - task.sample = tile_manager.state.sample; - task.num_samples = tile_manager.state.num_samples; - tile_manager.state.buffer.get_offset_stride(task.offset, task.stride); - task.buffers = buffers; - } - } +int2 Session::get_render_tile_offset() const +{ + return path_trace_->get_render_tile_offset(); +} - device->task_add(task); +string_view Session::get_render_tile_layer() const +{ + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); + return buffer_params.layer; } -void Session::copy_to_display_buffer(int sample) +string_view Session::get_render_tile_view() const { - /* add film conversion task */ - DeviceTask task(DeviceTask::FILM_CONVERT); - - task.x = tile_manager.state.buffer.full_x; - task.y = tile_manager.state.buffer.full_y; - task.w = tile_manager.state.buffer.width; - task.h = tile_manager.state.buffer.height; - task.rgba_byte = display->rgba_byte.device_pointer; - task.rgba_half = display->rgba_half.device_pointer; - task.buffer = buffers->buffer.device_pointer; - task.sample = sample; - tile_manager.state.buffer.get_offset_stride(task.offset, task.stride); - - if (task.w > 0 && task.h > 0) { - device->task_add(task); - device->task_wait(); - - /* set display to new size */ - display->draw_set(task.w, task.h); - - last_display_time_ = time_dt(); - } + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); + return buffer_params.view; +} - display_outdated_ = false; +bool Session::copy_render_tile_from_device() +{ + return path_trace_->copy_render_tile_from_device(); } -bool Session::update_progressive_refine(bool cancel) +bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels) { - int sample = 
tile_manager.state.sample + 1; - bool write = sample == tile_manager.num_samples || cancel; + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happenning while this function runs. */ - double current_time = time_dt(); + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); - if (current_time - last_update_time_ < params.progressive_update_timeout) { - /* If last sample was processed, we need to write buffers anyway. */ - if (!write && sample != 1) - return false; + const BufferPass *pass = buffer_params.find_pass(pass_name); + if (pass == nullptr) { + return false; } - if (params.progressive_refine) { - foreach (Tile &tile, tile_manager.state.tiles) { - if (!tile.buffers) { - continue; - } - - RenderTile rtile; - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; - rtile.sample = sample; - rtile.buffers = tile.buffers; - - if (write) { - if (write_render_tile_cb) - write_render_tile_cb(rtile); - } - else { - if (update_render_tile_cb) - update_render_tile_cb(rtile, true); - } + const bool has_denoised_result = path_trace_->has_denoised_result(); + if (pass->mode == PassMode::DENOISED && !has_denoised_result) { + pass = buffer_params.find_pass(pass->type); + if (pass == nullptr) { + /* Happens when denoised result pass is requested but is never written by the kernel. 
*/ + return false; } } - last_update_time_ = current_time; + pass = buffer_params.get_actual_display_pass(pass); + + const float exposure = buffer_params.exposure; + const int num_samples = path_trace_->get_num_render_tile_samples(); - return write; + PassAccessor::PassAccessInfo pass_access_info(*pass); + pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher; + pass_access_info.use_approximate_shadow_catcher_background = + pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background; + + const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + const PassAccessor::Destination destination(pixels, num_components); + + return path_trace_->get_render_tile_pixels(pass_accessor, destination); } -void Session::device_free() +bool Session::set_render_tile_pixels(const string &pass_name, + int num_components, + const float *pixels) { - scene->device_free(); + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happenning while this function runs. */ + + const BufferPass *pass = buffer_params_.find_pass(pass_name); + if (!pass) { + return false; + } + + const float exposure = scene->film->get_exposure(); + const int num_samples = render_scheduler_.get_num_rendered_samples(); - tile_manager.device_free(); + const PassAccessor::PassAccessInfo pass_access_info(*pass); + PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + PassAccessor::Source source(pixels, num_components); - /* used from background render only, so no need to - * re-create render/display buffers here - */ + return path_trace_->set_render_tile_pixels(pass_accessor, source); } -void Session::collect_statistics(RenderStats *render_stats) +/* -------------------------------------------------------------------- + * Full-frame on-disk storage. 
+ */ + +void Session::process_full_buffer_from_disk(string_view filename) { - scene->collect_statistics(render_stats); - if (params.use_profiling && (params.device.type == DEVICE_CPU)) { - render_stats->collect_profiling(scene, profiler); - } + path_trace_->process_full_buffer_from_disk(filename); } CCL_NAMESPACE_END diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 05025c10f9c..492cfdd1c09 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -18,6 +18,7 @@ #define __SESSION_H__ #include "device/device.h" +#include "integrator/render_scheduler.h" #include "render/buffers.h" #include "render/shader.h" #include "render/stats.h" @@ -26,6 +27,7 @@ #include "util/util_progress.h" #include "util/util_stats.h" #include "util/util_thread.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -33,41 +35,35 @@ CCL_NAMESPACE_BEGIN class BufferParams; class Device; class DeviceScene; -class DeviceRequestedFeatures; -class DisplayBuffer; +class PathTrace; class Progress; +class GPUDisplay; class RenderBuffers; class Scene; +class SceneParams; /* Session Parameters */ class SessionParams { public: DeviceInfo device; + + bool headless; bool background; - bool progressive_refine; - bool progressive; bool experimental; int samples; - int2 tile_size; - TileOrder tile_order; - int start_resolution; - int denoising_start_sample; int pixel_size; int threads; - bool adaptive_sampling; - - bool use_profiling; - bool display_buffer_linear; + /* Limit in seconds for how long path tracing is allowed to happen. + * Zero means no limit is applied. 
*/ + double time_limit; - DenoiseParams denoising; + bool use_profiling; - double cancel_timeout; - double reset_timeout; - double text_timeout; - double progressive_update_timeout; + bool use_auto_tile; + int tile_size; ShadingSystem shadingsystem; @@ -75,50 +71,32 @@ class SessionParams { SessionParams() { + headless = false; background = false; - progressive_refine = false; - progressive = false; experimental = false; samples = 1024; - tile_size = make_int2(64, 64); - start_resolution = INT_MAX; - denoising_start_sample = 0; pixel_size = 1; threads = 0; - adaptive_sampling = false; + time_limit = 0.0; use_profiling = false; - display_buffer_linear = false; - - cancel_timeout = 0.1; - reset_timeout = 0.1; - text_timeout = 1.0; - progressive_update_timeout = 1.0; + use_auto_tile = true; + tile_size = 2048; shadingsystem = SHADINGSYSTEM_SVM; - tile_order = TILE_CENTER; } - bool modified(const SessionParams ¶ms) + bool modified(const SessionParams ¶ms) const { /* Modified means we have to recreate the session, any parameter changes * that can be handled by an existing Session are omitted. 
*/ - return !(device == params.device && background == params.background && - progressive_refine == params.progressive_refine && - progressive == params.progressive && experimental == params.experimental && - tile_size == params.tile_size && start_resolution == params.start_resolution && + return !(device == params.device && headless == params.headless && + background == params.background && experimental == params.experimental && pixel_size == params.pixel_size && threads == params.threads && - adaptive_sampling == params.adaptive_sampling && - use_profiling == params.use_profiling && - display_buffer_linear == params.display_buffer_linear && - cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout && - text_timeout == params.text_timeout && - progressive_update_timeout == params.progressive_update_timeout && - tile_order == params.tile_order && shadingsystem == params.shadingsystem && - denoising.type == params.denoising.type && - (denoising.use == params.denoising.use || (device.denoisers & denoising.type))); + use_profiling == params.use_profiling && shadingsystem == params.shadingsystem && + use_auto_tile == params.use_auto_tile && tile_size == params.tile_size); } }; @@ -131,34 +109,41 @@ class Session { public: Device *device; Scene *scene; - RenderBuffers *buffers; - DisplayBuffer *display; Progress progress; SessionParams params; - TileManager tile_manager; Stats stats; Profiler profiler; - function<void(RenderTile &)> write_render_tile_cb; - function<void(RenderTile &, bool)> update_render_tile_cb; - function<void(RenderTile &)> read_bake_tile_cb; + function<void(void)> write_render_tile_cb; + function<void(void)> update_render_tile_cb; + function<void(void)> read_render_tile_cb; + + /* Callback is invoked by tile manager whenever on-dist tiles storage file is closed after + * writing. 
Allows an engine integration to keep track of those files without worry about + * transfering the information when it needs to re-create session during rendering. */ + function<void(string_view)> full_buffer_written_cb; - explicit Session(const SessionParams ¶ms); + explicit Session(const SessionParams ¶ms, const SceneParams &scene_params); ~Session(); void start(); - void cancel(); - bool draw(BufferParams ¶ms, DeviceDrawParams &draw_params); + + /* When quick cancel is requested path tracing is cancelles as soon as possible, without waiting + * for the buffer to be uniformly sampled. */ + void cancel(bool quick = false); + + void draw(); void wait(); bool ready_to_reset(); - void reset(BufferParams ¶ms, int samples); + void reset(const SessionParams &session_params, const BufferParams &buffer_params); + void set_pause(bool pause); + void set_samples(int samples); - void set_denoising(const DenoiseParams &denoising); - void set_denoising_start_sample(int sample); + void set_time_limit(double time_limit); - bool update_scene(); + void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); void device_free(); @@ -168,83 +153,95 @@ class Session { void collect_statistics(RenderStats *stats); - protected: - struct DelayedReset { - thread_mutex mutex; - bool do_reset; - BufferParams params; - int samples; - } delayed_reset_; + /* -------------------------------------------------------------------- + * Tile and tile pixels aceess. + */ - void run(); + bool has_multiple_render_tiles() const; - bool run_update_for_next_iteration(); - bool run_wait_for_work(bool no_tiles); + /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. 
*/ + int2 get_render_tile_size() const; + int2 get_render_tile_offset() const; - void update_status_time(bool show_pause = false, bool show_done = false); + string_view get_render_tile_layer() const; + string_view get_render_tile_view() const; - void render(bool use_denoise); - void copy_to_display_buffer(int sample); + bool copy_render_tile_from_device(); - void reset_(BufferParams ¶ms, int samples); + bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels); + bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels); - void run_cpu(); - bool draw_cpu(BufferParams ¶ms, DeviceDrawParams &draw_params); - void reset_cpu(BufferParams ¶ms, int samples); + /* -------------------------------------------------------------------- + * Full-frame on-disk storage. + */ - void run_gpu(); - bool draw_gpu(BufferParams ¶ms, DeviceDrawParams &draw_params); - void reset_gpu(BufferParams ¶ms, int samples); + /* Read given full-frame file from disk, perform needed processing and write it to the software + * via the write callback. */ + void process_full_buffer_from_disk(string_view filename); - bool render_need_denoise(bool &delayed); + protected: + struct DelayedReset { + thread_mutex mutex; + bool do_reset; + SessionParams session_params; + BufferParams buffer_params; + } delayed_reset_; - bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock); - bool get_tile_stolen(); - bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types); - void update_tile_sample(RenderTile &tile); - void release_tile(RenderTile &tile, const bool need_denoise); + void run(); - void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device); - void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device); + /* Update for the new iteration of the main loop in run implementation (run_cpu and run_gpu). 
+ * + * Will take care of the following things: + * - Delayed reset + * - Scene update + * - Tile manager advance + * - Render scheduler work request + * + * The updates are done in a proper order with proper locking around them, which guarantees + * that the device side of scene and render buffers are always in a consistent state. + * + * Returns render work which is to be rendered next. */ + RenderWork run_update_for_next_iteration(); + + /* Wait for rendering to be unpaused, or for new tiles for render to arrive. + * Returns true if new main render loop iteration is required after this function call. + * + * The `render_work` is the work which was scheduled by the render scheduler right before + * checking the pause. */ + bool run_wait_for_work(const RenderWork &render_work); + + void run_main_render_loop(); + + bool update_scene(int width, int height); - bool device_use_gl_; + void update_status_time(bool show_pause = false, bool show_done = false); - thread *session_thread_; + void do_delayed_reset(); - volatile bool display_outdated_; + int2 get_effective_tile_size() const; - volatile bool gpu_draw_ready_; - volatile bool gpu_need_display_buffer_update_; - thread_condition_variable gpu_need_display_buffer_update_cond_; + thread *session_thread_; - bool pause_; - bool cancel_; - bool new_work_added_; + bool pause_ = false; + bool cancel_ = false; + bool new_work_added_ = false; thread_condition_variable pause_cond_; thread_mutex pause_mutex_; thread_mutex tile_mutex_; thread_mutex buffers_mutex_; - thread_mutex display_mutex_; - thread_condition_variable denoising_cond_; - thread_condition_variable tile_steal_cond_; - - double reset_time_; - double last_update_time_; - double last_display_time_; - - RenderTile stolen_tile_; - typedef enum { - NOT_STEALING, /* There currently is no tile stealing in progress. */ - WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */ - RELEASING_TILE, /* A device has releasing a stealable tile. 
*/ - GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */ - } TileStealingState; - std::atomic<TileStealingState> tile_stealing_state_; - int stealable_tiles_; - - /* progressive refine */ - bool update_progressive_refine(bool cancel); + + TileManager tile_manager_; + BufferParams buffer_params_; + + /* Render scheduler is used to get work to be rendered with the current big tile. */ + RenderScheduler render_scheduler_; + + /* Path tracer object. + * + * Is a single full-frame path tracer for interactive viewport rendering. + * A path tracer for the current big-tile for an offline rendering. */ + unique_ptr<PathTrace> path_trace_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 59b60904746..f6b23606e58 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -203,6 +203,7 @@ Shader::Shader() : Node(get_node_type()) has_surface = false; has_surface_transparent = false; has_surface_emission = false; + has_surface_raytrace = false; has_surface_bssrdf = false; has_volume = false; has_displacement = false; @@ -485,7 +486,7 @@ void ShaderManager::device_update(Device *device, device_update_specific(device, dscene, scene, progress); } -void ShaderManager::device_update_common(Device *device, +void ShaderManager::device_update_common(Device * /*device*/, DeviceScene *dscene, Scene *scene, Progress & /*progress*/) @@ -508,6 +509,8 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_HAS_EMISSION; if (shader->has_surface_transparent && shader->get_use_transparent_shadow()) flag |= SD_HAS_TRANSPARENT_SHADOW; + if (shader->has_surface_raytrace) + flag |= SD_HAS_RAYTRACE; if (shader->has_volume) { flag |= SD_HAS_VOLUME; has_volumes = true; @@ -528,12 +531,10 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_NEED_VOLUME_ATTRIBUTES; if (shader->has_bssrdf_bump) flag |= SD_HAS_BSSRDF_BUMP; - if 
(device->info.has_volume_decoupled) { - if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR) - flag |= SD_VOLUME_EQUIANGULAR; - if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) - flag |= SD_VOLUME_MIS; - } + if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR) + flag |= SD_VOLUME_EQUIANGULAR; + if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) + flag |= SD_VOLUME_MIS; if (shader->get_volume_interpolation_method() == VOLUME_INTERPOLATION_CUBIC) flag |= SD_VOLUME_CUBIC; if (shader->has_bump) @@ -682,39 +683,35 @@ void ShaderManager::add_default(Scene *scene) } } -void ShaderManager::get_requested_graph_features(ShaderGraph *graph, - DeviceRequestedFeatures *requested_features) +uint ShaderManager::get_graph_kernel_features(ShaderGraph *graph) { + uint kernel_features = 0; + foreach (ShaderNode *node, graph->nodes) { - requested_features->max_nodes_group = max(requested_features->max_nodes_group, - node->get_group()); - requested_features->nodes_features |= node->get_feature(); + kernel_features |= node->get_feature(); if (node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) { BsdfBaseNode *bsdf_node = static_cast<BsdfBaseNode *>(node); if (CLOSURE_IS_VOLUME(bsdf_node->get_closure_type())) { - requested_features->nodes_features |= NODE_FEATURE_VOLUME; + kernel_features |= KERNEL_FEATURE_NODE_VOLUME; } else if (CLOSURE_IS_PRINCIPLED(bsdf_node->get_closure_type())) { - requested_features->use_principled = true; + kernel_features |= KERNEL_FEATURE_PRINCIPLED; } } if (node->has_surface_bssrdf()) { - requested_features->use_subsurface = true; + kernel_features |= KERNEL_FEATURE_SUBSURFACE; } if (node->has_surface_transparent()) { - requested_features->use_transparent = true; - } - if (node->has_raytrace()) { - requested_features->use_shader_raytrace = true; + kernel_features |= KERNEL_FEATURE_TRANSPARENT; } } + + return kernel_features; } -void 
ShaderManager::get_requested_features(Scene *scene, - DeviceRequestedFeatures *requested_features) +uint ShaderManager::get_kernel_features(Scene *scene) { - requested_features->max_nodes_group = NODE_GROUP_LEVEL_0; - requested_features->nodes_features = 0; + uint kernel_features = KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION; for (int i = 0; i < scene->shaders.size(); i++) { Shader *shader = scene->shaders[i]; if (!shader->reference_count()) { @@ -722,21 +719,22 @@ void ShaderManager::get_requested_features(Scene *scene, } /* Gather requested features from all the nodes from the graph nodes. */ - get_requested_graph_features(shader->graph, requested_features); + kernel_features |= get_graph_kernel_features(shader->graph); ShaderNode *output_node = shader->graph->output(); if (output_node->input("Displacement")->link != NULL) { - requested_features->nodes_features |= NODE_FEATURE_BUMP; + kernel_features |= KERNEL_FEATURE_NODE_BUMP; if (shader->get_displacement_method() == DISPLACE_BOTH) { - requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE; - requested_features->max_nodes_group = max(requested_features->max_nodes_group, - NODE_GROUP_LEVEL_1); + kernel_features |= KERNEL_FEATURE_NODE_BUMP_STATE; } } /* On top of volume nodes, also check if we need volume sampling because - * e.g. an Emission node would slip through the NODE_FEATURE_VOLUME check */ - if (shader->has_volume) - requested_features->use_volume |= true; + * e.g. 
an Emission node would slip through the KERNEL_FEATURE_NODE_VOLUME check */ + if (shader->has_volume) { + kernel_features |= KERNEL_FEATURE_VOLUME; + } } + + return kernel_features; } void ShaderManager::free_memory() diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index c65cac351a4..5f9adea3949 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN class Device; class DeviceScene; -class DeviceRequestedFeatures; class Mesh; class Progress; class Scene; @@ -117,6 +116,7 @@ class Shader : public Node { bool has_surface; bool has_surface_emission; bool has_surface_transparent; + bool has_surface_raytrace; bool has_volume; bool has_displacement; bool has_surface_bssrdf; @@ -216,7 +216,7 @@ class ShaderManager { static void add_default(Scene *scene); /* Selective nodes compilation. */ - void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features); + uint get_kernel_features(Scene *scene); static void free_memory(); @@ -244,8 +244,7 @@ class ShaderManager { size_t beckmann_table_offset; - void get_requested_graph_features(ShaderGraph *graph, - DeviceRequestedFeatures *requested_features); + uint get_graph_kernel_features(ShaderGraph *graph); thread_spin_lock attribute_lock_; diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp index 2c6273842e2..73eb7e21ff9 100644 --- a/intern/cycles/render/stats.cpp +++ b/intern/cycles/render/stats.cpp @@ -264,53 +264,34 @@ void RenderStats::collect_profiling(Scene *scene, Profiler &prof) has_profiling = true; kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN)); - kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP)); - kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT)); - - NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", - prof.get_event(PROFILING_PATH_INTEGRATE)); - integrator.add_entry("Scene 
intersection", prof.get_event(PROFILING_SCENE_INTERSECT)); - integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION)); - integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME)); - - NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0); - shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP)); - shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL)); - shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY)); - shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO)); - shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE)); - - integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT)); - integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE)); - - NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0); - intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT)); - intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL)); - intersection.add_entry("Shadow All Intersection", - prof.get_event(PROFILING_INTERSECT_SHADOW_ALL)); - intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME)); - intersection.add_entry("Volume All Intersection", - prof.get_event(PROFILING_INTERSECT_VOLUME_ALL)); - - NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0); - closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL)); - closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE)); - closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL)); - closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE)); - - NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", - prof.get_event(PROFILING_DENOISING)); - denoising.add_entry("Construct Transform", - 
prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM)); - denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT)); - - NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0); - prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW)); - prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS)); - prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE)); - prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS)); - prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES)); + kernel.add_entry("Intersect Closest", prof.get_event(PROFILING_INTERSECT_CLOSEST)); + kernel.add_entry("Intersect Shadow", prof.get_event(PROFILING_INTERSECT_SHADOW)); + kernel.add_entry("Intersect Subsurface", prof.get_event(PROFILING_INTERSECT_SUBSURFACE)); + kernel.add_entry("Intersect Volume Stack", prof.get_event(PROFILING_INTERSECT_VOLUME_STACK)); + + NamedNestedSampleStats &surface = kernel.add_entry("Shade Surface", 0); + surface.add_entry("Setup", prof.get_event(PROFILING_SHADE_SURFACE_SETUP)); + surface.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_SURFACE_EVAL)); + surface.add_entry("Render Passes", prof.get_event(PROFILING_SHADE_SURFACE_PASSES)); + surface.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_SURFACE_DIRECT_LIGHT)); + surface.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT)); + surface.add_entry("Ambient Occlusion", prof.get_event(PROFILING_SHADE_SURFACE_AO)); + + NamedNestedSampleStats &volume = kernel.add_entry("Shade Volume", 0); + volume.add_entry("Setup", prof.get_event(PROFILING_SHADE_VOLUME_SETUP)); + volume.add_entry("Integrate", prof.get_event(PROFILING_SHADE_VOLUME_INTEGRATE)); + volume.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_VOLUME_DIRECT_LIGHT)); + volume.add_entry("Indirect 
Light", prof.get_event(PROFILING_SHADE_VOLUME_INDIRECT_LIGHT)); + + NamedNestedSampleStats &shadow = kernel.add_entry("Shade Shadow", 0); + shadow.add_entry("Setup", prof.get_event(PROFILING_SHADE_SHADOW_SETUP)); + shadow.add_entry("Surface", prof.get_event(PROFILING_SHADE_SHADOW_SURFACE)); + shadow.add_entry("Volume", prof.get_event(PROFILING_SHADE_SHADOW_VOLUME)); + + NamedNestedSampleStats &light = kernel.add_entry("Shade Light", 0); + light.add_entry("Setup", prof.get_event(PROFILING_SHADE_LIGHT_SETUP)); + light.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_LIGHT_EVAL)); shaders.entries.clear(); foreach (Shader *shader, scene->shaders) { diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index dcb3976e15c..2379eb775a0 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -446,6 +446,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done) if (current_type == SHADER_TYPE_SURFACE) { if (node->has_spatial_varying()) current_shader->has_surface_spatial_varying = true; + if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE) + current_shader->has_surface_raytrace = true; } else if (current_type == SHADER_TYPE_VOLUME) { if (node->has_spatial_varying()) @@ -492,6 +494,13 @@ void SVMCompiler::generate_svm_nodes(const ShaderNodeSet &nodes, CompilerState * void SVMCompiler::generate_closure_node(ShaderNode *node, CompilerState *state) { + /* Skip generating closure that are not supported or needed for a particular + * type of shader. For example a BSDF in a volume shader. 
*/ + const int node_feature = node->get_feature(); + if ((state->node_feature_mask & node_feature) != node_feature) { + return; + } + /* execute dependencies for closure */ foreach (ShaderInput *in, node->inputs) { if (in->link != NULL) { @@ -555,7 +564,7 @@ void SVMCompiler::find_aov_nodes_and_dependencies(ShaderNodeSet &aov_nodes, foreach (ShaderNode *node, graph->nodes) { if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) { OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node); - if (aov_node->slot >= 0) { + if (aov_node->offset >= 0) { aov_nodes.insert(aov_node); foreach (ShaderInput *in, node->inputs) { if (in->link != NULL) { @@ -785,17 +794,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty case SHADER_TYPE_SURFACE: /* generate surface shader */ generate = true; shader->has_surface = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE; break; case SHADER_TYPE_VOLUME: /* generate volume shader */ generate = true; shader->has_volume = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_VOLUME; break; case SHADER_TYPE_DISPLACEMENT: /* generate displacement shader */ generate = true; shader->has_displacement = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_DISPLACEMENT; break; case SHADER_TYPE_BUMP: /* generate bump shader */ generate = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_BUMP; break; default: break; @@ -867,6 +880,7 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum shader->has_surface = false; shader->has_surface_emission = false; shader->has_surface_transparent = false; + shader->has_surface_raytrace = false; shader->has_surface_bssrdf = false; shader->has_bump = has_bump; shader->has_bssrdf_bump = has_bump; @@ -964,6 +978,7 @@ SVMCompiler::CompilerState::CompilerState(ShaderGraph *graph) max_id = max(node->id, max_id); } nodes_done_flag.resize(max_id + 1, false); + node_feature_mask = 0; } CCL_NAMESPACE_END diff --git 
a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index d23ff3e2a47..0353c393ae4 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -192,6 +192,9 @@ class SVMCompiler { * all areas to use this flags array. */ vector<bool> nodes_done_flag; + + /* Node features that can be compiled. */ + uint node_feature_mask; }; void stack_clear_temporary(ShaderNode *node); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 375c9fd8e09..eed75cc2372 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -16,601 +16,559 @@ #include "render/tile.h" +#include <atomic> + +#include "graph/node.h" +#include "render/background.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/scene.h" #include "util/util_algorithm.h" #include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_system.h" #include "util/util_types.h" CCL_NAMESPACE_BEGIN -namespace { +/* -------------------------------------------------------------------- + * Internal functions. + */ -class TileComparator { - public: - TileComparator(TileOrder order_, int2 center_, Tile *tiles_) - : order(order_), center(center_), tiles(tiles_) - { - } +static const char *ATTR_PASSES_COUNT = "cycles.passes.count"; +static const char *ATTR_PASS_SOCKET_PREFIX_FORMAT = "cycles.passes.%d."; +static const char *ATTR_BUFFER_SOCKET_PREFIX = "cycles.buffer."; +static const char *ATTR_DENOISE_SOCKET_PREFIX = "cycles.denoise."; - bool operator()(int a, int b) - { - switch (order) { - case TILE_CENTER: { - float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w / 2), - center.y - (tiles[a].y + tiles[a].h / 2)); - float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w / 2), - center.y - (tiles[b].y + tiles[b].h / 2)); - return dot(dist_a, dist_a) < dot(dist_b, dist_b); - } - case TILE_LEFT_TO_RIGHT: - return (tiles[a].x == tiles[b].x) ? 
(tiles[a].y < tiles[b].y) : (tiles[a].x < tiles[b].x); - case TILE_RIGHT_TO_LEFT: - return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x > tiles[b].x); - case TILE_TOP_TO_BOTTOM: - return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y > tiles[b].y); - case TILE_BOTTOM_TO_TOP: - default: - return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y < tiles[b].y); +/* Global counter of ToleManager object instances. */ +static std::atomic<uint64_t> g_instance_index = 0; + +/* Construct names of EXR channels which will ensure order of all channels to match exact offsets + * in render buffers corresponding to the given passes. + * + * Returns `std` datatypes so that it can be assigned directly to the OIIO's `ImageSpec`. */ +static std::vector<std::string> exr_channel_names_for_passes(const BufferParams &buffer_params) +{ + static const char *component_suffixes[] = {"R", "G", "B", "A"}; + + int pass_index = 0; + int num_channels = 0; + std::vector<std::string> channel_names; + for (const BufferPass &pass : buffer_params.passes) { + if (pass.offset == PASS_UNUSED) { + continue; } - } - protected: - TileOrder order; - int2 center; - Tile *tiles; -}; + const PassInfo pass_info = pass.get_info(); + num_channels += pass_info.num_components; -inline int2 hilbert_index_to_pos(int n, int d) -{ - int2 r, xy = make_int2(0, 0); - for (int s = 1; s < n; s *= 2) { - r.x = (d >> 1) & 1; - r.y = (d ^ r.x) & 1; - if (!r.y) { - if (r.x) { - xy = make_int2(s - 1, s - 1) - xy; - } - swap(xy.x, xy.y); + /* EXR canonically expects first part of channel names to be sorted alphabetically, which is + * not guaranteed to be the case with passes names. Assign a prefix based on the pass index + * with a fixed width to ensure ordering. This makes it possible to dump existing render + * buffers memory to disk and read it back without doing extra mapping. 
*/ + const string prefix = string_printf("%08d", pass_index); + + const string channel_name_prefix = prefix + string(pass.name) + "."; + + for (int i = 0; i < pass_info.num_components; ++i) { + channel_names.push_back(channel_name_prefix + component_suffixes[i]); } - xy += r * make_int2(s, s); - d >>= 2; + + ++pass_index; } - return xy; + + return channel_names; } -enum SpiralDirection { - DIRECTION_UP, - DIRECTION_LEFT, - DIRECTION_DOWN, - DIRECTION_RIGHT, -}; - -} /* namespace */ - -TileManager::TileManager(bool progressive_, - int num_samples_, - int2 tile_size_, - int start_resolution_, - bool preserve_tile_device_, - bool background_, - TileOrder tile_order_, - int num_devices_, - int pixel_size_) +inline string node_socket_attribute_name(const SocketType &socket, const string &attr_name_prefix) { - progressive = progressive_; - tile_size = tile_size_; - tile_order = tile_order_; - start_resolution = start_resolution_; - pixel_size = pixel_size_; - slice_overlap = 0; - num_samples = num_samples_; - num_devices = num_devices_; - preserve_tile_device = preserve_tile_device_; - background = background_; - schedule_denoising = false; - - range_start_sample = 0; - range_num_samples = -1; - - BufferParams buffer_params; - reset(buffer_params, 0); + return attr_name_prefix + string(socket.name); } -TileManager::~TileManager() +template<typename ValidateValueFunc, typename GetValueFunc> +static bool node_socket_generic_to_image_spec_atttributes( + ImageSpec *image_spec, + const Node *node, + const SocketType &socket, + const string &attr_name_prefix, + const ValidateValueFunc &validate_value_func, + const GetValueFunc &get_value_func) { + if (!validate_value_func(node, socket)) { + return false; + } + + image_spec->attribute(node_socket_attribute_name(socket, attr_name_prefix), + get_value_func(node, socket)); + + return true; } -void TileManager::device_free() +static bool node_socket_to_image_spec_atttributes(ImageSpec *image_spec, + const Node *node, + const 
SocketType &socket, + const string &attr_name_prefix) { - if (schedule_denoising || progressive) { - for (int i = 0; i < state.tiles.size(); i++) { - delete state.tiles[i].buffers; - state.tiles[i].buffers = NULL; + const string attr_name = node_socket_attribute_name(socket, attr_name_prefix); + + switch (socket.type) { + case SocketType::ENUM: { + const ustring value = node->get_string(socket); + + /* Validate that the node is consistent with the node type definition. */ + const NodeEnum &enum_values = *socket.enum_values; + if (!enum_values.exists(value)) { + LOG(DFATAL) << "Node enum contains invalid value " << value; + return false; + } + + image_spec->attribute(attr_name, value); + + return true; } - } - state.tiles.clear(); + case SocketType::STRING: + image_spec->attribute(attr_name, node->get_string(socket)); + return true; + + case SocketType::INT: + image_spec->attribute(attr_name, node->get_int(socket)); + return true; + + case SocketType::FLOAT: + image_spec->attribute(attr_name, node->get_float(socket)); + return true; + + case SocketType::BOOLEAN: + image_spec->attribute(attr_name, node->get_bool(socket)); + return true; + + default: + LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen."; + return false; + } } -static int get_divider(int w, int h, int start_resolution) +static bool node_socket_from_image_spec_atttributes(Node *node, + const SocketType &socket, + const ImageSpec &image_spec, + const string &attr_name_prefix) { - int divider = 1; - if (start_resolution != INT_MAX) { - while (w * h > start_resolution * start_resolution) { - w = max(1, w / 2); - h = max(1, h / 2); + const string attr_name = node_socket_attribute_name(socket, attr_name_prefix); + + switch (socket.type) { + case SocketType::ENUM: { + /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. 
*/ + const ustring value(image_spec.get_string_attribute(attr_name, "")); + + /* Validate that the node is consistent with the node type definition. */ + const NodeEnum &enum_values = *socket.enum_values; + if (!enum_values.exists(value)) { + LOG(ERROR) << "Invalid enumerator value " << value; + return false; + } - divider <<= 1; + node->set(socket, enum_values[value]); + + return true; } + + case SocketType::STRING: + /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */ + node->set(socket, ustring(image_spec.get_string_attribute(attr_name, ""))); + return true; + + case SocketType::INT: + node->set(socket, image_spec.get_int_attribute(attr_name, 0)); + return true; + + case SocketType::FLOAT: + node->set(socket, image_spec.get_float_attribute(attr_name, 0)); + return true; + + case SocketType::BOOLEAN: + node->set(socket, static_cast<bool>(image_spec.get_int_attribute(attr_name, 0))); + return true; + + default: + LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen."; + return false; } - return divider; } -void TileManager::reset(BufferParams ¶ms_, int num_samples_) +static bool node_to_image_spec_atttributes(ImageSpec *image_spec, + const Node *node, + const string &attr_name_prefix) { - params = params_; - - set_samples(num_samples_); - - state.buffer = BufferParams(); - state.sample = range_start_sample - 1; - state.num_tiles = 0; - state.num_samples = 0; - state.resolution_divider = get_divider(params.width, params.height, start_resolution); - state.render_tiles.clear(); - state.denoising_tiles.clear(); - device_free(); + for (const SocketType &socket : node->type->inputs) { + if (!node_socket_to_image_spec_atttributes(image_spec, node, socket, attr_name_prefix)) { + return false; + } + } + + return true; } -void TileManager::set_samples(int num_samples_) +static bool node_from_image_spec_atttributes(Node *node, + const ImageSpec &image_spec, + const string &attr_name_prefix) { - num_samples = 
num_samples_; + for (const SocketType &socket : node->type->inputs) { + if (!node_socket_from_image_spec_atttributes(node, socket, image_spec, attr_name_prefix)) { + return false; + } + } + + return true; +} - /* No real progress indication is possible when using unlimited samples. */ - if (num_samples == INT_MAX) { - state.total_pixel_samples = 0; +static bool buffer_params_to_image_spec_atttributes(ImageSpec *image_spec, + const BufferParams &buffer_params) +{ + if (!node_to_image_spec_atttributes(image_spec, &buffer_params, ATTR_BUFFER_SOCKET_PREFIX)) { + return false; } - else { - uint64_t pixel_samples = 0; - /* While rendering in the viewport, the initial preview resolution is increased to the native - * resolution before the actual rendering begins. Therefore, additional pixel samples will be - * rendered. */ - int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size); - while (divider > pixel_size) { - int image_w = max(1, params.width / divider); - int image_h = max(1, params.height / divider); - pixel_samples += image_w * image_h; - divider >>= 1; - } - int image_w = max(1, params.width / divider); - int image_h = max(1, params.height / divider); - state.total_pixel_samples = pixel_samples + - (uint64_t)get_num_effective_samples() * image_w * image_h; - if (schedule_denoising) { - state.total_pixel_samples += params.width * params.height; + /* Passes storage is not covered by the node socket. so "expand" the loop manually. 
*/ + + const int num_passes = buffer_params.passes.size(); + image_spec->attribute(ATTR_PASSES_COUNT, num_passes); + + for (int pass_index = 0; pass_index < num_passes; ++pass_index) { + const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index); + + const BufferPass *pass = &buffer_params.passes[pass_index]; + if (!node_to_image_spec_atttributes(image_spec, pass, attr_name_prefix)) { + return false; } } + + return true; } -/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render - * device. If sliced is true, slice image into as much pieces as how many devices are rendering - * this image. */ -int TileManager::gen_tiles(bool sliced) +static bool buffer_params_from_image_spec_atttributes(BufferParams *buffer_params, + const ImageSpec &image_spec) { - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); - int2 center = make_int2(image_w / 2, image_h / 2); - - int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1; - int slice_num = sliced ? num : 1; - int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x); - - device_free(); - state.render_tiles.clear(); - state.denoising_tiles.clear(); - state.render_tiles.resize(num); - state.denoising_tiles.resize(num); - state.tile_stride = tile_w; - vector<list<int>>::iterator tile_list; - tile_list = state.render_tiles.begin(); - - if (tile_order == TILE_HILBERT_SPIRAL) { - assert(!sliced && slice_overlap == 0); - - int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); - state.tiles.resize(tile_w * tile_h); - - /* Size of blocks in tiles, must be a power of 2 */ - const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12) ? 
8 : 4; - - int tiles_per_device = divide_up(tile_w * tile_h, num); - int cur_device = 0, cur_tiles = 0; - - int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); - /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w) ? 1 : divide_up(image_w, block_size.x); - int blocks_y = (block_size.y >= image_h) ? 1 : divide_up(image_h, block_size.y); - int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ - /* Offset of spiral (to keep it centered) */ - int2 offset = make_int2((image_w - n * block_size.x) / 2, (image_h - n * block_size.y) / 2); - offset = (offset / tile_size) * tile_size; /* Round to tile border. */ - - int2 block = make_int2(0, 0); /* Current block */ - SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP; - for (int i = 0;;) { - /* Generate the tiles in the current block. */ - for (int hilbert_index = 0; hilbert_index < hilbert_size * hilbert_size; hilbert_index++) { - int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index); - /* Rotate block according to spiral direction. */ - if (prev_dir == DIRECTION_UP && dir == DIRECTION_UP) { - tile = make_int2(hilbert_pos.y, hilbert_pos.x); - } - else if (dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) { - tile = hilbert_pos; - } - else if (dir == DIRECTION_DOWN) { - tile = make_int2(hilbert_size - 1 - hilbert_pos.y, hilbert_size - 1 - hilbert_pos.x); - } - else { - tile = make_int2(hilbert_size - 1 - hilbert_pos.x, hilbert_size - 1 - hilbert_pos.y); - } - - int2 pos = block * block_size + tile * tile_size + offset; - /* Only add tiles which are in the image (tiles outside of the image can be generated since - * the spiral is always square). 
*/ - if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { - int w = min(tile_size.x, image_w - pos.x); - int h = min(tile_size.y, image_h - pos.y); - int2 ipos = pos / tile_size; - int idx = ipos.y * tile_w + ipos.x; - state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); - tile_list->push_front(idx); - cur_tiles++; - - if (cur_tiles == tiles_per_device) { - tile_list++; - cur_tiles = 0; - cur_device++; - } - } - } + if (!node_from_image_spec_atttributes(buffer_params, image_spec, ATTR_BUFFER_SOCKET_PREFIX)) { + return false; + } - /* Stop as soon as the spiral has reached the center block. */ - if (block.x == (n - 1) / 2 && block.y == (n - 1) / 2) - break; - - /* Advance to next block. */ - prev_dir = dir; - switch (dir) { - case DIRECTION_UP: - block.y++; - if (block.y == (n - i - 1)) { - dir = DIRECTION_LEFT; - } - break; - case DIRECTION_LEFT: - block.x++; - if (block.x == (n - i - 1)) { - dir = DIRECTION_DOWN; - } - break; - case DIRECTION_DOWN: - block.y--; - if (block.y == i) { - dir = DIRECTION_RIGHT; - } - break; - case DIRECTION_RIGHT: - block.x--; - if (block.x == i + 1) { - dir = DIRECTION_UP; - i++; - } - break; - } - } - return tile_w * tile_h; + /* Passes storage is not covered by the node socket. so "expand" the loop manually. */ + + const int num_passes = image_spec.get_int_attribute(ATTR_PASSES_COUNT, 0); + if (num_passes == 0) { + LOG(ERROR) << "Missing passes count attribute."; + return false; } - int idx = 0; - for (int slice = 0; slice < slice_num; slice++) { - int slice_y = (image_h / slice_num) * slice; - int slice_h = (slice == slice_num - 1) ? 
image_h - slice * (image_h / slice_num) : - image_h / slice_num; + for (int pass_index = 0; pass_index < num_passes; ++pass_index) { + const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index); - if (slice_overlap != 0) { - int slice_y_offset = max(slice_y - slice_overlap, 0); - slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset; - slice_y = slice_y_offset; - } + BufferPass pass; - int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y); - - int tiles_per_device = divide_up(tile_w * tile_h, num); - int cur_device = 0, cur_tiles = 0; - - for (int tile_y = 0; tile_y < tile_h; tile_y++) { - for (int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { - int x = tile_x * tile_size.x; - int y = tile_y * tile_size.y; - int w = (tile_x == tile_w - 1) ? image_w - x : tile_size.x; - int h = (tile_y == tile_h - 1) ? slice_h - y : tile_size.y; - - state.tiles.push_back( - Tile(idx, x, y + slice_y, w, h, sliced ? slice : cur_device, Tile::RENDER)); - tile_list->push_back(idx); - - if (!sliced) { - cur_tiles++; - - if (cur_tiles == tiles_per_device) { - /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that - * case. */ - if (tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); - } - tile_list++; - cur_tiles = 0; - cur_device++; - } - } - } - } - if (sliced) { - tile_list++; + if (!node_from_image_spec_atttributes(&pass, image_spec, attr_name_prefix)) { + return false; } + + buffer_params->passes.emplace_back(std::move(pass)); } - return idx; + buffer_params->update_passes(); + + return true; } -void TileManager::gen_render_tiles() +/* Configure image specification for the given buffer parameters and passes. + * + * Image channels will be strictly ordered to match content of corresponding buffer, and the + * metadata will be set so that the render buffers and passes can be reconstructed from it. 
+ * + * If the tile size is different from (0, 0) the image specification will be configured to use the + * given tile size for tiled IO. */ +static bool configure_image_spec_from_buffer(ImageSpec *image_spec, + const BufferParams &buffer_params, + const int2 tile_size = make_int2(0, 0)) { - /* Regenerate just the render tiles for progressive render. */ - foreach (Tile &tile, state.tiles) { - tile.state = Tile::RENDER; - state.render_tiles[tile.device].push_back(tile.index); + const std::vector<std::string> channel_names = exr_channel_names_for_passes(buffer_params); + const int num_channels = channel_names.size(); + + *image_spec = ImageSpec( + buffer_params.width, buffer_params.height, num_channels, TypeDesc::FLOAT); + + image_spec->channelnames = move(channel_names); + + if (!buffer_params_to_image_spec_atttributes(image_spec, buffer_params)) { + return false; + } + + if (tile_size.x != 0 || tile_size.y != 0) { + DCHECK_GT(tile_size.x, 0); + DCHECK_GT(tile_size.y, 0); + + image_spec->tile_width = tile_size.x; + image_spec->tile_height = tile_size.y; } + + return true; } -void TileManager::set_tiles() +/* -------------------------------------------------------------------- + * Tile Manager. + */ + +TileManager::TileManager() { - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); + /* Use process ID to separate different processes. + * To ensure uniqueness from within a process use combination of object address and instance + * index. This solves problem of possible object re-allocation at the same time, and solves + * possible conflict when the counter overflows while there are still active instances of the + * class. 
*/ + const int tile_manager_id = g_instance_index.fetch_add(1, std::memory_order_relaxed); + tile_file_unique_part_ = to_string(system_self_process_id()) + "-" + + to_string(reinterpret_cast<uintptr_t>(this)) + "-" + + to_string(tile_manager_id); +} - state.num_tiles = gen_tiles(!background); +TileManager::~TileManager() +{ +} + +void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size) +{ + VLOG(3) << "Using tile size of " << tile_size; + + close_tile_output(); + + tile_size_ = tile_size; + + tile_state_.num_tiles_x = divide_up(params.width, tile_size_.x); + tile_state_.num_tiles_y = divide_up(params.height, tile_size_.y); + tile_state_.num_tiles = tile_state_.num_tiles_x * tile_state_.num_tiles_y; + + tile_state_.next_tile_index = 0; + + tile_state_.current_tile = Tile(); +} + +void TileManager::update(const BufferParams &params, const Scene *scene) +{ + DCHECK_NE(params.pass_stride, -1); + + buffer_params_ = params; - state.buffer.width = image_w; - state.buffer.height = image_h; + /* TODO(sergey): Proper Error handling, so that if configuration has failed we don't attempt to + * write to a partially configured file. 
*/ + configure_image_spec_from_buffer(&write_state_.image_spec, buffer_params_, tile_size_); - state.buffer.full_x = params.full_x / resolution; - state.buffer.full_y = params.full_y / resolution; - state.buffer.full_width = max(1, params.full_width / resolution); - state.buffer.full_height = max(1, params.full_height / resolution); + const DenoiseParams denoise_params = scene->integrator->get_denoise_params(); + node_to_image_spec_atttributes( + &write_state_.image_spec, &denoise_params, ATTR_DENOISE_SOCKET_PREFIX); } -int TileManager::get_neighbor_index(int index, int neighbor) +bool TileManager::done() { - /* Neighbor indices: - * 0 1 2 - * 3 4 5 - * 6 7 8 - */ - static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1}; - static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1}; - - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); - - int num = min(image_h, num_devices); - int slice_num = !background ? num : 1; - int slice_h = image_h / slice_num; - - int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x); - int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y); - - /* Tiles in the state tile list are always indexed from left to right, top to bottom. */ - int nx = (index % tile_w) + dx[neighbor]; - int ny = (index / tile_w) + dy[neighbor]; - if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num) - return -1; - - return ny * state.tile_stride + nx; + return tile_state_.next_tile_index == tile_state_.num_tiles; } -/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state - * min_state. 
*/ -bool TileManager::check_neighbor_state(int index, Tile::State min_state) +bool TileManager::next() { - if (index < 0 || state.tiles[index].state < min_state) { + if (done()) { return false; } - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - /* Out-of-bounds tiles don't matter. */ - if (nindex >= 0 && state.tiles[nindex].state < min_state) { - return false; - } - } + + tile_state_.current_tile = get_tile_for_index(tile_state_.next_tile_index); + + ++tile_state_.next_tile_index; return true; } -/* Returns whether the tile should be written (and freed if no denoising is used) instead of - * updating. */ -bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile) +Tile TileManager::get_tile_for_index(int index) const { - delete_tile = false; - - switch (state.tiles[index].state) { - case Tile::RENDER: { - if (!(schedule_denoising && need_denoise)) { - state.tiles[index].state = Tile::DONE; - delete_tile = !progressive; - return true; - } - state.tiles[index].state = Tile::RENDERED; - /* For each neighbor and the tile itself, check whether all of its neighbors have been - * rendered. If yes, it can be denoised. */ - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - if (check_neighbor_state(nindex, Tile::RENDERED)) { - state.tiles[nindex].state = Tile::DENOISE; - state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); - } - } - return false; - } - case Tile::DENOISE: { - state.tiles[index].state = Tile::DENOISED; - /* For each neighbor and the tile itself, check whether all of its neighbors have been - * denoised. If yes, it can be freed. */ - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - if (check_neighbor_state(nindex, Tile::DENOISED)) { - state.tiles[nindex].state = Tile::DONE; - /* Do not delete finished tiles in progressive mode. 
*/ - if (!progressive) { - /* It can happen that the tile just finished denoising and already can be freed here. - * However, in that case it still has to be written before deleting, so we can't delete - * it yet. */ - if (neighbor == 4) { - delete_tile = true; - } - else { - delete state.tiles[nindex].buffers; - state.tiles[nindex].buffers = NULL; - } - } - } - } - return true; - } - default: - assert(false); - return true; + /* TODO(sergey): Consider using hilbert spiral, or. maybe, even configurable. Not sure this + * brings a lot of value since this is only applicable to BIG tiles. */ + + const int tile_y = index / tile_state_.num_tiles_x; + const int tile_x = index - tile_y * tile_state_.num_tiles_x; + + Tile tile; + + tile.x = tile_x * tile_size_.x; + tile.y = tile_y * tile_size_.y; + tile.width = tile_size_.x; + tile.height = tile_size_.y; + + tile.width = min(tile.width, buffer_params_.width - tile.x); + tile.height = min(tile.height, buffer_params_.height - tile.y); + + return tile; +} + +const Tile &TileManager::get_current_tile() const +{ + return tile_state_.current_tile; +} + +bool TileManager::open_tile_output() +{ + write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" + + to_string(write_state_.tile_file_index) + ".exr"); + + write_state_.tile_out = ImageOutput::create(write_state_.filename); + if (!write_state_.tile_out) { + LOG(ERROR) << "Error creating image output for " << write_state_.filename; + return false; + } + + if (!write_state_.tile_out->supports("tiles")) { + LOG(ERROR) << "Progress tile file format does not support tiling."; + return false; } + + write_state_.tile_out->open(write_state_.filename, write_state_.image_spec); + write_state_.num_tiles_written = 0; + + VLOG(3) << "Opened tile file " << write_state_.filename; + + return true; } -bool TileManager::next_tile(Tile *&tile, int device, uint tile_types) +bool TileManager::close_tile_output() { - /* Preserve device if requested, unless this is a 
separate denoising device that just wants to - * grab any available tile. */ - const bool preserve_device = preserve_tile_device && device < num_devices; - - if (tile_types & RenderTile::DENOISE) { - int tile_index = -1; - int logical_device = preserve_device ? device : 0; - - while (logical_device < state.denoising_tiles.size()) { - if (state.denoising_tiles[logical_device].empty()) { - if (preserve_device) { - break; - } - else { - logical_device++; - continue; - } - } + if (!write_state_.tile_out) { + return true; + } - tile_index = state.denoising_tiles[logical_device].front(); - state.denoising_tiles[logical_device].pop_front(); - break; - } + const bool success = write_state_.tile_out->close(); + write_state_.tile_out = nullptr; - if (tile_index >= 0) { - tile = &state.tiles[tile_index]; - return true; - } + if (!success) { + LOG(ERROR) << "Error closing tile file."; + return false; } - if (tile_types & RenderTile::PATH_TRACE) { - int tile_index = -1; - int logical_device = preserve_device ? device : 0; - - while (logical_device < state.render_tiles.size()) { - if (state.render_tiles[logical_device].empty()) { - if (preserve_device) { - break; - } - else { - logical_device++; - continue; - } - } + VLOG(3) << "Tile output is closed."; - tile_index = state.render_tiles[logical_device].front(); - state.render_tiles[logical_device].pop_front(); - break; + return true; +} + +bool TileManager::write_tile(const RenderBuffers &tile_buffers) +{ + if (!write_state_.tile_out) { + if (!open_tile_output()) { + return false; } + } - if (tile_index >= 0) { - tile = &state.tiles[tile_index]; - return true; + DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride); + + const BufferParams &tile_params = tile_buffers.params; + + vector<float> pixel_storage; + const float *pixels = tile_buffers.buffer.data(); + + /* Tiled writing expects pixels to contain data for an entire tile. 
Pad the render buffers with + * empty pixels for tiles which are on the image boundary. */ + if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) { + const int64_t pass_stride = tile_params.pass_stride; + const int64_t src_row_stride = tile_params.width * pass_stride; + + const int64_t dst_row_stride = tile_size_.x * pass_stride; + pixel_storage.resize(dst_row_stride * tile_size_.y); + + const float *src = tile_buffers.buffer.data(); + float *dst = pixel_storage.data(); + pixels = dst; + + for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) { + memcpy(dst, src, src_row_stride * sizeof(float)); } } - return false; -} + const int tile_x = tile_params.full_x - buffer_params_.full_x; + const int tile_y = tile_params.full_y - buffer_params_.full_y; -bool TileManager::done() -{ - int end_sample = (range_num_samples == -1) ? num_samples : - range_start_sample + range_num_samples; - return (state.resolution_divider == pixel_size) && - (state.sample + state.num_samples >= end_sample); + VLOG(3) << "Write tile at " << tile_x << ", " << tile_y; + if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) { + LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror(); + } + + ++write_state_.num_tiles_written; + + return true; } -bool TileManager::has_tiles() +void TileManager::finish_write_tiles() { - foreach (Tile &tile, state.tiles) { - if (tile.state != Tile::DONE) { - return true; + if (!write_state_.tile_out) { + /* None of the tiles were written hence the file was not created. + * Avoid creation of fully empty file since it is redundant. */ + return; + } + + /* EXR expects all tiles to present in file. So explicitly write missing tiles as all-zero. 
*/ + if (write_state_.num_tiles_written < tile_state_.num_tiles) { + vector<float> pixel_storage(tile_size_.x * tile_size_.y * buffer_params_.pass_stride); + + for (int tile_index = write_state_.num_tiles_written; tile_index < tile_state_.num_tiles; + ++tile_index) { + const Tile tile = get_tile_for_index(tile_index); + + VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y; + + write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data()); } } - return false; + + close_tile_output(); + + if (full_buffer_written_cb) { + full_buffer_written_cb(write_state_.filename); + } + + /* Advance the counter upon explicit finish of the file. + * Makes it possible to re-use tile manager for another scene, and avoids unnecessary increments + * of the tile-file-within-session index. */ + ++write_state_.tile_file_index; + + write_state_.filename = ""; } -bool TileManager::next() +bool TileManager::read_full_buffer_from_disk(const string_view filename, + RenderBuffers *buffers, + DenoiseParams *denoise_params) { - if (done()) + unique_ptr<ImageInput> in(ImageInput::open(filename)); + if (!in) { + LOG(ERROR) << "Error opening tile file " << filename; return false; + } + + const ImageSpec &image_spec = in->spec(); - if (progressive && state.resolution_divider > pixel_size) { - state.sample = 0; - state.resolution_divider = max(state.resolution_divider / 2, pixel_size); - state.num_samples = 1; - set_tiles(); + BufferParams buffer_params; + if (!buffer_params_from_image_spec_atttributes(&buffer_params, image_spec)) { + return false; } - else { - state.sample++; + buffers->reset(buffer_params); - if (progressive) - state.num_samples = 1; - else if (range_num_samples == -1) - state.num_samples = num_samples; - else - state.num_samples = range_num_samples; + if (!node_from_image_spec_atttributes(denoise_params, image_spec, ATTR_DENOISE_SOCKET_PREFIX)) { + return false; + } - state.resolution_divider = pixel_size; + if 
(!in->read_image(TypeDesc::FLOAT, buffers->buffer.data())) { + LOG(ERROR) << "Error reading pixels from the tile file " << in->geterror(); + return false; + } - if (state.sample == range_start_sample) { - set_tiles(); - } - else { - gen_render_tiles(); - } + if (!in->close()) { + LOG(ERROR) << "Error closing tile file " << in->geterror(); + return false; } return true; } -int TileManager::get_num_effective_samples() -{ - return (range_num_samples == -1) ? num_samples : range_num_samples; -} - CCL_NAMESPACE_END diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 790a56f9445..124d0b3652c 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -14,159 +14,151 @@ * limitations under the License. */ -#ifndef __TILE_H__ -#define __TILE_H__ - -#include <limits.h> +#pragma once #include "render/buffers.h" -#include "util/util_list.h" +#include "util/util_image.h" +#include "util/util_string.h" +#include "util/util_unique_ptr.h" CCL_NAMESPACE_BEGIN -/* Tile */ +class DenoiseParams; +class Scene; + +/* -------------------------------------------------------------------- + * Tile. + */ class Tile { public: - int index; - int x, y, w, h; - int device; - /* RENDER: The tile has to be rendered. - * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). - * DENOISE: The tile can be denoised now. - * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). - * DONE: The tile is finished and has been freed. 
*/ - typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; - State state; - RenderBuffers *buffers; + int x = 0, y = 0; + int width = 0, height = 0; Tile() { } - - Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) - { - } }; -/* Tile order */ - -/* Note: this should match enum_tile_order in properties.py */ -enum TileOrder { - TILE_CENTER = 0, - TILE_RIGHT_TO_LEFT = 1, - TILE_LEFT_TO_RIGHT = 2, - TILE_TOP_TO_BOTTOM = 3, - TILE_BOTTOM_TO_TOP = 4, - TILE_HILBERT_SPIRAL = 5, -}; - -/* Tile Manager */ +/* -------------------------------------------------------------------- + * Tile Manager. + */ class TileManager { public: - BufferParams params; - - struct State { - vector<Tile> tiles; - int tile_stride; - BufferParams buffer; - int sample; - int num_samples; - int resolution_divider; - int num_tiles; - - /* Total samples over all pixels: Generally num_samples*num_pixels, - * but can be higher due to the initial resolution division for previews. */ - uint64_t total_pixel_samples; - - /* These lists contain the indices of the tiles to be rendered/denoised and are used - * when acquiring a new tile for the device. - * Each list in each vector is for one logical device. */ - vector<list<int>> render_tiles; - vector<list<int>> denoising_tiles; - } state; - - int num_samples; - int slice_overlap; - - TileManager(bool progressive, - int num_samples, - int2 tile_size, - int start_resolution, - bool preserve_tile_device, - bool background, - TileOrder tile_order, - int num_devices = 1, - int pixel_size = 1); + /* This callback is invoked whenever the on-disk tiles storage file is closed after writing. 
*/ + function<void(string_view)> full_buffer_written_cb; + + TileManager(); ~TileManager(); - void device_free(); - void reset(BufferParams &params, int num_samples); - void set_samples(int num_samples); + TileManager(const TileManager &other) = delete; + TileManager(TileManager &&other) noexcept = delete; + TileManager &operator=(const TileManager &other) = delete; + TileManager &operator=(TileManager &&other) = delete; + + /* Reset current progress and start new rendering of the full-frame parameters in tiles of the + * given size. + * Only touches scheduling-related state of the tile manager. */ + /* TODO(sergey): Consider using tile area instead of exact size to help dealing with extreme + * cases of stretched renders. */ + void reset_scheduling(const BufferParams &params, int2 tile_size); + + /* Update for the known buffer passes and scene parameters. + * Will store all parameters needed for buffers access outside of the scene graph. */ + void update(const BufferParams &params, const Scene *scene); + + inline int get_num_tiles() const + { + return tile_state_.num_tiles; + } + + inline bool has_multiple_tiles() const + { + return tile_state_.num_tiles > 1; + } + bool next(); - bool next_tile(Tile *&tile, int device, uint tile_types); - bool finish_tile(const int index, const bool need_denoise, bool &delete_tile); bool done(); - bool has_tiles(); - void set_tile_order(TileOrder tile_order_) + const Tile &get_current_tile() const; + + /* Write render buffer of a tile to a file on disk. + * + * Opens file for write when first tile is written. + * + * Returns true on success. */ + bool write_tile(const RenderBuffers &tile_buffers); + + /* Inform the tile manager that no more tiles will be written to disk. + * The file will be considered final, all handles to it will be closed. */ + void finish_write_tiles(); + + /* Check whether any tile has been written to disk. 
*/ + inline bool has_written_tiles() const { - tile_order = tile_order_; + return write_state_.num_tiles_written != 0; } - int get_neighbor_index(int index, int neighbor); - bool check_neighbor_state(int index, Tile::State state); + /* Read full frame render buffer from tiles file on disk. + * + * Returns true on success. */ + bool read_full_buffer_from_disk(string_view filename, + RenderBuffers *buffers, + DenoiseParams *denoise_params); - /* ** Sample range rendering. ** */ + protected: + /* Get tile configuration for its index. + * The tile index must be within [0, state_.tile_state_). */ + Tile get_tile_for_index(int index) const; - /* Start sample in the range. */ - int range_start_sample; + bool open_tile_output(); + bool close_tile_output(); - /* Number to samples in the rendering range. */ - int range_num_samples; + /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or + * several sessions. */ + string tile_file_unique_part_; - /* Get number of actual samples to render. */ - int get_num_effective_samples(); + int2 tile_size_ = make_int2(0, 0); - /* Schedule tiles for denoising after they've been rendered. */ - bool schedule_denoising; + BufferParams buffer_params_; - protected: - void set_tiles(); - - bool progressive; - int2 tile_size; - TileOrder tile_order; - int start_resolution; - int pixel_size; - int num_devices; - - /* in some cases it is important that the same tile will be returned for the same - * device it was originally generated for (i.e. viewport rendering when buffer is - * allocating once for tile and then always used by it) - * - * in other cases any tile could be handled by any device (i.e. 
final rendering - * without progressive refine) - */ - bool preserve_tile_device; - - /* for background render tiles should exactly match render parts generated from - * blender side, which means image first gets split into tiles and then tiles are - * assigning to render devices - * - * however viewport rendering expects tiles to be allocated in a special way, - * meaning image is being sliced horizontally first and every device handles - * its own slice - */ - bool background; - - /* Generate tile list, return number of tiles. */ - int gen_tiles(bool sliced); - void gen_render_tiles(); + /* Tile scheduling state. */ + struct { + int num_tiles_x = 0; + int num_tiles_y = 0; + int num_tiles = 0; + + int next_tile_index; + + Tile current_tile; + } tile_state_; + + /* State of tiles writing to a file on disk. */ + struct { + /* Index of a tile file used during the current session. + * This number is used for the file name construction, making it possible to render several + * scenes throughout duration of the session and keep all results available for later read + * access. */ + int tile_file_index = 0; + + string filename; + + /* Specification of the tile image which corresponds to the buffer parameters. + * Contains channels configured according to the passes configuration in the path traces. + * + * Output images are saved using this specification, input images are expected to have matched + * specification. */ + ImageSpec image_spec; + + /* Output handle for the tile file. + * + * This file can not be closed until all tiles has been provided, so the handle is stored in + * the state and is created whenever writing is requested. 
*/ + unique_ptr<ImageOutput> tile_out; + + int num_tiles_written = 0; + } write_state_; }; CCL_NAMESPACE_END - -#endif /* __TILE_H__ */ diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt index 65a692acd03..0f6b435813f 100644 --- a/intern/cycles/test/CMakeLists.txt +++ b/intern/cycles/test/CMakeLists.txt @@ -32,6 +32,7 @@ set(INC set(ALL_CYCLES_LIBRARIES cycles_device cycles_kernel + cycles_integrator cycles_render cycles_bvh cycles_graph @@ -45,8 +46,12 @@ include_directories(${INC}) cycles_link_directories() set(SRC + integrator_adaptive_sampling_test.cpp + integrator_render_scheduler_test.cpp + integrator_tile_test.cpp render_graph_finalize_test.cpp util_aligned_malloc_test.cpp + util_math_test.cpp util_path_test.cpp util_string_test.cpp util_task_test.cpp diff --git a/intern/cycles/test/integrator_adaptive_sampling_test.cpp b/intern/cycles/test/integrator_adaptive_sampling_test.cpp new file mode 100644 index 00000000000..3ed6a23125d --- /dev/null +++ b/intern/cycles/test/integrator_adaptive_sampling_test.cpp @@ -0,0 +1,116 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "testing/testing.h" + +#include "integrator/adaptive_sampling.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +TEST(AdaptiveSampling, schedule_samples) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 0; + adaptive_sampling.adaptive_step = 4; + + for (int sample = 2; sample < 32; ++sample) { + for (int num_samples = 8; num_samples < 32; ++num_samples) { + const int num_samples_aligned = adaptive_sampling.align_samples(sample, num_samples); + /* NOTE: `sample + num_samples_aligned` is the number of samples after rendering, so need + * to convert this to the 0-based index of the last sample. */ + EXPECT_TRUE(adaptive_sampling.need_filter(sample + num_samples_aligned - 1)); + } + } +} + +TEST(AdaptiveSampling, align_samples) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 11 /* rounded of sqrt(128) */; + adaptive_sampling.adaptive_step = 4; + + /* Filtering will happen at the following samples: + * 15, 19, 23, 27, 31, 35, 39, 43 */ + + /* Requested sample and number of samples will result in number of samples lower than + * `min_samples`. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 4), 4); + EXPECT_EQ(adaptive_sampling.align_samples(0, 7), 7); + + /* Request number of samples higher than the minimum samples before filter, but prior to the + * first sample at which filtering will happen. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 15), 15); + + /* When rendering many samples from the very beginning, limit number of samples by the first + * sample at which filtering is to happen. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 16), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 17), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 20), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 60), 16); + + /* Similar to above, but start sample is not 0. 
*/ + EXPECT_EQ(adaptive_sampling.align_samples(9, 8), 7); + EXPECT_EQ(adaptive_sampling.align_samples(9, 20), 7); + EXPECT_EQ(adaptive_sampling.align_samples(9, 60), 7); + + /* Start sample is past the minimum required samples, but prior to the first filter sample. */ + EXPECT_EQ(adaptive_sampling.align_samples(12, 6), 4); + EXPECT_EQ(adaptive_sampling.align_samples(12, 20), 4); + EXPECT_EQ(adaptive_sampling.align_samples(12, 60), 4); + + /* Start sample is the sample which is to be filtered. */ + EXPECT_EQ(adaptive_sampling.align_samples(15, 4), 1); + EXPECT_EQ(adaptive_sampling.align_samples(15, 6), 1); + EXPECT_EQ(adaptive_sampling.align_samples(15, 10), 1); + EXPECT_EQ(adaptive_sampling.align_samples(58, 2), 2); + + /* Start sample is past the sample which is to be filtered. */ + EXPECT_EQ(adaptive_sampling.align_samples(16, 3), 3); + EXPECT_EQ(adaptive_sampling.align_samples(16, 4), 4); + EXPECT_EQ(adaptive_sampling.align_samples(16, 5), 4); + EXPECT_EQ(adaptive_sampling.align_samples(16, 10), 4); + + /* Should never exceed requested number of samples. 
*/ + EXPECT_EQ(adaptive_sampling.align_samples(15, 2), 1); + EXPECT_EQ(adaptive_sampling.align_samples(16, 2), 2); + EXPECT_EQ(adaptive_sampling.align_samples(17, 2), 2); + EXPECT_EQ(adaptive_sampling.align_samples(18, 2), 2); +} + +TEST(AdaptiveSampling, need_filter) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 11 /* rounded of sqrt(128) */; + adaptive_sampling.adaptive_step = 4; + + const vector<int> expected_samples_to_filter = { + {15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59}}; + + vector<int> actual_samples_to_filter; + for (int sample = 0; sample < 60; ++sample) { + if (adaptive_sampling.need_filter(sample)) { + actual_samples_to_filter.push_back(sample); + } + } + + EXPECT_EQ(actual_samples_to_filter, expected_samples_to_filter); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/integrator_render_scheduler_test.cpp b/intern/cycles/test/integrator_render_scheduler_test.cpp new file mode 100644 index 00000000000..b4efbc2d1a7 --- /dev/null +++ b/intern/cycles/test/integrator_render_scheduler_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "testing/testing.h" + +#include "integrator/render_scheduler.h" + +CCL_NAMESPACE_BEGIN + +TEST(IntegratorRenderScheduler, calculate_resolution_divider_for_resolution) +{ + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 1920), 1); + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 960), 2); + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 480), 4); +} + +TEST(IntegratorRenderScheduler, calculate_resolution_for_divider) +{ + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 1), 1440); + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 2), 720); + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 4), 360); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp new file mode 100644 index 00000000000..5bb57b48c3c --- /dev/null +++ b/intern/cycles/test/integrator_tile_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "testing/testing.h" + +#include "integrator/tile.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +TEST(tile_calculate_best_size, Basic) +{ + /* Make sure CPU-like case is handled properly. 
*/ + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1), TileSize(1, 1, 1)); + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1), TileSize(1, 1, 1)); + + /* Enough path states to fit an entire image with all samples. */ + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080), + TileSize(1920, 1080, 1)); + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100), + TileSize(1920, 1080, 100)); +} + +TEST(tile_calculate_best_size, Extreme) +{ + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072), TileSize(1, 1, 512)); + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072), TileSize(1, 1, 1024)); + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072), TileSize(1, 1, 4096)); + + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024), + TileSize(1, 1, 1024)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index da9b29314a7..19c211fe5f7 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -181,7 +181,7 @@ class RenderGraph : public testing::Test { util_logging_start(); util_logging_verbosity_set(1); - device_cpu = Device::create(device_info, stats, profiler, true); + device_cpu = Device::create(device_info, stats, profiler); scene = new Scene(scene_params, device_cpu); } diff --git a/intern/cycles/test/util_math_test.cpp b/intern/cycles/test/util_math_test.cpp new file mode 100644 index 00000000000..b6ce3ef0cf3 --- /dev/null +++ b/intern/cycles/test/util_math_test.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "testing/testing.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +TEST(math, next_power_of_two) +{ + EXPECT_EQ(next_power_of_two(0), 1); + EXPECT_EQ(next_power_of_two(1), 2); + EXPECT_EQ(next_power_of_two(2), 4); + EXPECT_EQ(next_power_of_two(3), 4); + EXPECT_EQ(next_power_of_two(4), 8); +} + +TEST(math, prev_power_of_two) +{ + EXPECT_EQ(prev_power_of_two(0), 0); + + EXPECT_EQ(prev_power_of_two(1), 1); + EXPECT_EQ(prev_power_of_two(2), 1); + + EXPECT_EQ(prev_power_of_two(3), 2); + EXPECT_EQ(prev_power_of_two(4), 2); + + EXPECT_EQ(prev_power_of_two(5), 4); + EXPECT_EQ(prev_power_of_two(6), 4); + EXPECT_EQ(prev_power_of_two(7), 4); + EXPECT_EQ(prev_power_of_two(8), 4); +} + +TEST(math, reverse_integer_bits) +{ + EXPECT_EQ(reverse_integer_bits(0xFFFFFFFF), 0xFFFFFFFF); + EXPECT_EQ(reverse_integer_bits(0x00000000), 0x00000000); + EXPECT_EQ(reverse_integer_bits(0x1), 0x80000000); + EXPECT_EQ(reverse_integer_bits(0x80000000), 0x1); + EXPECT_EQ(reverse_integer_bits(0xFFFF0000), 0x0000FFFF); + EXPECT_EQ(reverse_integer_bits(0x0000FFFF), 0xFFFF0000); + EXPECT_EQ(reverse_integer_bits(0x00FF0000), 0x0000FF00); + EXPECT_EQ(reverse_integer_bits(0x0000FF00), 0x00FF0000); + EXPECT_EQ(reverse_integer_bits(0xAAAAAAAA), 0x55555555); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp index 97f8daa65de..c9022d1b132 100644 --- a/intern/cycles/test/util_string_test.cpp +++ b/intern/cycles/test/util_string_test.cpp @@ -281,4 +281,40 @@ TEST(util_string_remove_trademark, 
r_space_middle) EXPECT_EQ(str, "foo bar baz"); } +/* ******** Tests for string_startswith() ******** */ + +TEST(string_startswith, basic) +{ + EXPECT_TRUE(string_startswith("", "")); + + EXPECT_FALSE(string_startswith("", "World")); + EXPECT_TRUE(string_startswith("Hello", "")); + + EXPECT_FALSE(string_startswith("Hello", "World")); + + EXPECT_TRUE(string_startswith("Hello", "Hello")); + EXPECT_TRUE(string_startswith("Hello", "He")); + EXPECT_TRUE(string_startswith("Hello", "H")); + + EXPECT_FALSE(string_startswith("Hello", "e")); + EXPECT_FALSE(string_startswith("Hello", "HelloWorld")); +} + +TEST(string_endswith, basic) +{ + EXPECT_TRUE(string_endswith("", "")); + + EXPECT_FALSE(string_endswith("", "World")); + EXPECT_TRUE(string_endswith("Hello", "")); + + EXPECT_FALSE(string_endswith("Hello", "World")); + + EXPECT_TRUE(string_endswith("Hello", "Hello")); + EXPECT_TRUE(string_endswith("Hello", "lo")); + EXPECT_TRUE(string_endswith("Hello", "o")); + + EXPECT_FALSE(string_endswith("Hello", "e")); + EXPECT_FALSE(string_endswith("Hello", "WorldHello")); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 13d177d2b25..de17efafcf2 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -34,56 +34,6 @@ #else /* __KERNEL_GPU__ */ -# ifdef __KERNEL_OPENCL__ - -/* Float atomics implementation credits: - * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html - */ -ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, - const float operand) -{ - union { - unsigned int int_value; - float float_value; - } new_value; - union { - unsigned int int_value; - float float_value; - } prev_value; - do { - prev_value.float_value = *source; - new_value.float_value = prev_value.float_value + operand; - } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source, - prev_value.int_value, - new_value.int_value) != 
prev_value.int_value); - return new_value.float_value; -} - -ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest, - const float old_val, - const float new_val) -{ - union { - unsigned int int_value; - float float_value; - } new_value, prev_value, result; - prev_value.float_value = old_val; - new_value.float_value = new_val; - result.int_value = atomic_cmpxchg( - (volatile ccl_global unsigned int *)dest, prev_value.int_value, new_value.int_value); - return result.float_value; -} - -# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) -# define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) -# define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) -# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x)) - -# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE -# define ccl_barrier(flags) barrier(flags) - -# endif /* __KERNEL_OPENCL__ */ - # ifdef __KERNEL_CUDA__ # define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x)) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 74ecefa1917..1d598725c84 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -26,13 +26,7 @@ CCL_NAMESPACE_BEGIN DebugFlags::CPU::CPU() - : avx2(true), - avx(true), - sse41(true), - sse3(true), - sse2(true), - bvh_layout(BVH_LAYOUT_AUTO), - split_kernel(false) + : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO) { reset(); } @@ -58,11 +52,9 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS bvh_layout = BVH_LAYOUT_AUTO; - - split_kernel = false; } -DebugFlags::CUDA::CUDA() : adaptive_compile(false), split_kernel(false) +DebugFlags::CUDA::CUDA() : adaptive_compile(false) { reset(); } @@ -71,8 +63,6 @@ void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; - - split_kernel = false; } DebugFlags::OptiX::OptiX() @@ -82,42 +72,7 @@ 
DebugFlags::OptiX::OptiX() void DebugFlags::OptiX::reset() { - cuda_streams = 1; - curves_api = false; -} - -DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false) -{ - reset(); -} - -void DebugFlags::OpenCL::reset() -{ - /* Initialize device type from environment variables. */ - device_type = DebugFlags::OpenCL::DEVICE_ALL; - char *device = getenv("CYCLES_OPENCL_TEST"); - if (device) { - if (strcmp(device, "NONE") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_NONE; - } - else if (strcmp(device, "ALL") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_ALL; - } - else if (strcmp(device, "DEFAULT") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_DEFAULT; - } - else if (strcmp(device, "CPU") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_CPU; - } - else if (strcmp(device, "GPU") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_GPU; - } - else if (strcmp(device, "ACCELERATOR") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR; - } - } - /* Initialize other flags from environment variables. 
*/ - debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); + use_debug = false; } DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false) @@ -131,7 +86,6 @@ void DebugFlags::reset() cpu.reset(); cuda.reset(); optix.reset(); - opencl.reset(); } std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) @@ -142,40 +96,13 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" - << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n" - << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; + << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; os << "OptiX flags:\n" - << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n"; - - const char *opencl_device_type; - switch (debug_flags.opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - opencl_device_type = "NONE"; - break; - case DebugFlags::OpenCL::DEVICE_ALL: - opencl_device_type = "ALL"; - break; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - opencl_device_type = "DEFAULT"; - break; - case DebugFlags::OpenCL::DEVICE_CPU: - opencl_device_type = "CPU"; - break; - case DebugFlags::OpenCL::DEVICE_GPU: - opencl_device_type = "GPU"; - break; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - opencl_device_type = "ACCELERATOR"; - break; - } - os << "OpenCL flags:\n" - << " Device type : " << opencl_device_type << "\n" - << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" - << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; + << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n"; return os; } diff --git 
a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index f7e53f90f74..99e2723180c 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -79,9 +79,6 @@ class DebugFlags { * CPUs and GPUs can be selected here instead. */ BVHLayout bvh_layout; - - /* Whether split kernel is used */ - bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -94,9 +91,6 @@ class DebugFlags { /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; - - /* Whether split kernel is used */ - bool split_kernel; }; /* Descriptor of OptiX feature-set to be used. */ @@ -106,61 +100,9 @@ class DebugFlags { /* Reset flags to their defaults. */ void reset(); - /* Number of CUDA streams to launch kernels concurrently from. */ - int cuda_streams; - - /* Use OptiX curves API for hair instead of custom implementation. */ - bool curves_api; - }; - - /* Descriptor of OpenCL feature-set to be used. */ - struct OpenCL { - OpenCL(); - - /* Reset flags to their defaults. */ - void reset(); - - /* Available device types. - * Only gives a hint which devices to let user to choose from, does not - * try to use any sort of optimal device or so. - */ - enum DeviceType { - /* None of OpenCL devices will be used. */ - DEVICE_NONE, - /* All OpenCL devices will be used. */ - DEVICE_ALL, - /* Default system OpenCL device will be used. */ - DEVICE_DEFAULT, - /* Host processor will be used. */ - DEVICE_CPU, - /* GPU devices will be used. */ - DEVICE_GPU, - /* Dedicated OpenCL accelerator device will be used. */ - DEVICE_ACCELERATOR, - }; - - /* Available kernel types. */ - enum KernelType { - /* Do automated guess which kernel to use, based on the officially - * supported GPUs and such. - */ - KERNEL_DEFAULT, - /* Force mega kernel to be used. */ - KERNEL_MEGA, - /* Force split kernel to be used. */ - KERNEL_SPLIT, - }; - - /* Requested device type. 
*/ - DeviceType device_type; - - /* Use debug version of the kernel. */ - bool debug; - - /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all - * devices. */ - /* Artificial memory limit in bytes (0 if disabled). */ - size_t mem_limit; + /* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable + * validations, and lower optimization level. */ + bool use_debug; }; /* Get instance of debug flags registry. */ @@ -182,9 +124,6 @@ class DebugFlags { /* Requested OptiX flags. */ OptiX optix; - /* Requested OpenCL flags. */ - OpenCL opencl; - private: DebugFlags(); diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 0a239a944a5..9b1698d461a 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -43,9 +43,9 @@ # define ccl_local_param # define ccl_private # define ccl_restrict __restrict -# define ccl_ref & # define ccl_optional_struct_init # define ccl_loop_no_unroll +# define ccl_attr_maybe_unused [[maybe_unused]] # define __KERNEL_WITH_SSE_ALIGN__ # if defined(_WIN32) && !defined(FREE_WINDOWS) @@ -62,7 +62,6 @@ # define ccl_may_alias # define ccl_always_inline __forceinline # define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused # else /* _WIN32 && !FREE_WINDOWS */ # define ccl_device_inline static inline __attribute__((always_inline)) # define ccl_device_forceinline static inline __attribute__((always_inline)) @@ -74,7 +73,6 @@ # define ccl_may_alias __attribute__((__may_alias__)) # define ccl_always_inline __attribute__((always_inline)) # define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) # endif /* _WIN32 && !FREE_WINDOWS */ /* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index a8d4ee75e20..d9edfec5da3 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ -#ifdef __KERNEL_OPENCL__ - -# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h); - -#else - /* CUDA has its own half data type, no need to define then */ -# ifndef __KERNEL_CUDA__ +#ifndef __KERNEL_CUDA__ /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -59,27 +53,27 @@ class half { private: unsigned short v; }; -# endif +#endif struct half4 { half x, y, z, w; }; -# ifdef __KERNEL_CUDA__ +#ifdef __KERNEL_CUDA__ -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { - h[0] = __float2half(f.x * scale); - h[1] = __float2half(f.y * scale); - h[2] = __float2half(f.z * scale); - h[3] = __float2half(f.w * scale); + h[0] = __float2half(f.x); + h[1] = __float2half(f.y); + h[2] = __float2half(f.z); + h[3] = __float2half(f.w); } -# else +#else -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { -# ifndef __KERNEL_SSE2__ +# ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: * assumes no negative, no nan, no inf, and sets denormal to 0 */ @@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) uint i; float f; } in; - float fscale = f[i] * scale; - in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f; + in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? 
f[i] : 65504.0f) : 0.0f; int x = in.i; int absolute = x & 0x7FFFFFFF; @@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) h[i] = (rshift & 0x7FFF); } -# else +# else /* same as above with SSE */ - ssef fscale = load4f(f) * scale; - ssef x = min(max(fscale, 0.0f), 65504.0f); + ssef x = min(max(load4f(f), 0.0f), 65504.0f); -# ifdef __KERNEL_AVX2__ +# ifdef __KERNEL_AVX2__ ssei rpack = _mm_cvtps_ph(x, 0); -# else +# else ssei absolute = cast(x) & 0x7FFFFFFF; ssei Z = absolute + 0xC8000000; ssei result = andnot(absolute < 0x38800000, Z); ssei rshift = (result >> 13) & 0x7FFF; ssei rpack = _mm_packs_epi32(rshift, rshift); -# endif +# endif _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); -# endif +# endif } ccl_device_inline float half_to_float(half h) @@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f) return (value_bits | sign_bit); } -# endif - #endif CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index c161299acd0..35c2d436d09 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -49,6 +49,7 @@ class LogMessageVoidify { # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() # define VLOG_IF(severity, condition) LOG_SUPPRESS() +# define VLOG_IS_ON(severity) false # define CHECK(expression) LOG_SUPPRESS() diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index c5996ebfcb6..6d728dde679 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -26,11 +26,9 @@ # include <cmath> #endif -#ifndef __KERNEL_OPENCL__ -# include <float.h> -# include <math.h> -# include <stdio.h> -#endif /* __KERNEL_OPENCL__ */ +#include <float.h> +#include <math.h> +#include <stdio.h> #include "util/util_types.h" @@ -86,7 +84,6 @@ CCL_NAMESPACE_BEGIN /* Scalar */ #ifdef _WIN32 -# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b) ? 
a : b; @@ -96,8 +93,7 @@ ccl_device_inline float fminf(float a, float b) { return (a < b) ? a : b; } -# endif /* !__KERNEL_OPENCL__ */ -#endif /* _WIN32 */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ using std::isfinite; @@ -119,6 +115,11 @@ ccl_device_inline int min(int a, int b) return (a < b) ? a : b; } +ccl_device_inline uint min(uint a, uint b) +{ + return (a < b) ? a : b; +} + ccl_device_inline float max(float a, float b) { return (a > b) ? a : b; @@ -166,7 +167,6 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -#ifndef __KERNEL_OPENCL__ /* Int/Float conversion */ ccl_device_inline int as_int(uint i) @@ -241,24 +241,23 @@ ccl_device_inline float __uint_as_float(uint i) ccl_device_inline int4 __float4_as_int4(float4 f) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(f.m128)); -# else +#else return make_int4( __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w)); -# endif +#endif } ccl_device_inline float4 __int4_as_float4(int4 i) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_castsi128_ps(i.m128)); -# else +#else return make_float4( __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w)); -# endif +#endif } -#endif /* __KERNEL_OPENCL__ */ /* Versions of functions which are safe for fast math. */ ccl_device_inline bool isnan_safe(float f) @@ -279,7 +278,6 @@ ccl_device_inline float ensure_finite(float v) return isfinite_safe(v) ? 
v : 0.0f; } -#ifndef __KERNEL_OPENCL__ ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -309,8 +307,6 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x) return result; } -#endif /* __KERNEL_OPENCL__ */ - #ifndef __KERNEL_CUDA__ ccl_device_inline float saturate(float a) { @@ -451,7 +447,6 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_OPENCL__ /* Interpolation */ template<class A, class B> A lerp(const A &a, const A &b, const B &t) @@ -459,15 +454,9 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t) return (A)(a * ((B)1 - t) + b * t); } -#endif /* __KERNEL_OPENCL__ */ - /* Triangle */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3) -#else -ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3) -#endif { return len(cross(v3 - v2, v1 - v2)) * 0.5f; } @@ -665,11 +654,7 @@ ccl_device_inline float pow22(float a) ccl_device_inline float beta(float x, float y) { -#ifndef __KERNEL_OPENCL__ return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y)); -#else - return expf(lgamma(x) + lgamma(y) - lgamma(x + y)); -#endif } ccl_device_inline float xor_signmask(float x, int y) @@ -686,8 +671,6 @@ ccl_device_inline uint count_leading_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return __clz(x); -#elif defined(__KERNEL_OPENCL__) - return clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -704,8 +687,6 @@ ccl_device_inline uint count_trailing_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return (__ffs(x) - 1); -#elif defined(__KERNEL_OPENCL__) - return (31 - count_leading_zeros(x & -x)); #else assert(x != 0); # ifdef _MSC_VER @@ -722,8 +703,6 @@ ccl_device_inline uint find_first_set(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return __ffs(x); -#elif defined(__KERNEL_OPENCL__) - return (x != 0) ? 
(32 - count_leading_zeros(x & (-x))) : 0; #else # ifdef _MSC_VER return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0; @@ -797,6 +776,52 @@ ccl_device_inline float precise_angle(float3 a, float3 b) return 2.0f * atan2f(len(a - b), len(a + b)); } +/* Return value which is greater than the given one and is a power of two. */ +ccl_device_inline uint next_power_of_two(uint x) +{ + return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x)); +} + +/* Return value which is lower than the given one and is a power of two. */ +ccl_device_inline uint prev_power_of_two(uint x) +{ + return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1)); +} + +#ifndef __has_builtin +# define __has_builtin(v) 0 +#endif + +/* Reverses the bits of a 32 bit integer. */ +ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) +{ + /* Use a native instruction if it exists. */ +#if defined(__arm__) || defined(__aarch64__) + __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x)); + return x; +#elif defined(__KERNEL_CUDA__) + return __brev(x); +#elif __has_builtin(__builtin_bitreverse32) + return __builtin_bitreverse32(x); +#else + /* Flip pairwise. */ + x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); + /* Flip pairs. */ + x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); + /* Flip nibbles. */ + x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); + /* Flip bytes. CPUs have an instruction for that, pretty fast one. */ +# ifdef _MSC_VER + return _byteswap_ulong(x); +# elif defined(__INTEL_COMPILER) + return (uint32_t)_bswap((int)x); +# else + /* Assuming gcc or clang. */ + return __builtin_bswap32(x); +# endif +#endif +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h index 17f6f3c9382..70b80c33544 100644 --- a/intern/cycles/util/util_math_float2.h +++ b/intern/cycles/util/util_math_float2.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float2 operator-(const float2 &a); ccl_device_inline float2 operator*(const float2 &a, const float2 &b); ccl_device_inline float2 operator*(const float2 &a, float f); @@ -64,7 +63,6 @@ ccl_device_inline float2 fabs(const float2 &a); ccl_device_inline float2 as_float2(const float4 &a); ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t); ccl_device_inline float2 floor(const float2 &a); -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b); @@ -82,7 +80,6 @@ ccl_device_inline float2 one_float2() return make_float2(1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float2 operator-(const float2 &a) { return make_float2(-a.x, -a.y); @@ -262,8 +259,6 @@ ccl_device_inline float2 floor(const float2 &a) return make_float2(floorf(a.x), floorf(a.y)); } -#endif /* !__KERNEL_OPENCL__ */ - ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b) { return (b != 0.0f) ? a / b : zero_float2(); diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index 9673c043189..30a1b4c3f77 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float3 operator-(const float3 &a); ccl_device_inline float3 operator*(const float3 &a, const float3 &b); ccl_device_inline float3 operator*(const float3 &a, const float f); @@ -63,7 +62,6 @@ ccl_device_inline float3 rcp(const float3 &a); ccl_device_inline float3 sqrt(const float3 &a); ccl_device_inline float3 floor(const float3 &a); ccl_device_inline float3 ceil(const float3 &a); -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float min3(float3 a); ccl_device_inline float max3(float3 a); @@ -105,50 +103,49 @@ ccl_device_inline float3 one_float3() return make_float3(1.0f, 1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float3 operator-(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -# else +#else return make_float3(-a.x, -a.y, -a.z); -# endif +#endif } ccl_device_inline float3 operator*(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); -# endif +#endif } ccl_device_inline float3 operator*(const float3 &a, const float f) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); -# else +#else return make_float3(a.x * f, a.y * f, a.z * f); -# endif +#endif } ccl_device_inline float3 operator*(const float f, const float3 &a) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -# else +#else return make_float3(a.x * f, a.y * f, a.z * f); -# endif +#endif } ccl_device_inline float3 operator/(const float f, const float3 &a) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); -# else +#else return make_float3(f / a.x, f / a.y, f / a.z); -# endif +#endif } ccl_device_inline float3 operator/(const float3 &a, const 
float f) @@ -159,11 +156,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f) ccl_device_inline float3 operator/(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -# endif +#endif } ccl_device_inline float3 operator+(const float3 &a, const float f) @@ -173,11 +170,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f) ccl_device_inline float3 operator+(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_add_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -# endif +#endif } ccl_device_inline float3 operator-(const float3 &a, const float f) @@ -187,11 +184,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f) ccl_device_inline float3 operator-(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_sub_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -# endif +#endif } ccl_device_inline float3 operator+=(float3 &a, const float3 &b) @@ -227,11 +224,11 @@ ccl_device_inline float3 operator/=(float3 &a, float f) ccl_device_inline bool operator==(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -# else +#else return (a.x == b.x && a.y == b.y && a.z == b.z); -# endif +#endif } ccl_device_inline bool operator!=(const float3 &a, const float3 &b) @@ -246,20 +243,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b) ccl_device_inline float dot(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -# else +#else return a.x * b.x + a.y * b.y 
+ a.z * b.z; -# endif +#endif } ccl_device_inline float dot_xy(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b)); -# else +#else return a.x * b.x + a.y * b.y; -# endif +#endif } ccl_device_inline float3 cross(const float3 &a, const float3 &b) @@ -270,30 +267,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b) ccl_device_inline float3 normalize(const float3 &a) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); return float3(_mm_div_ps(a.m128, norm)); -# else +#else return a / len(a); -# endif +#endif } ccl_device_inline float3 min(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_min_ps(a.m128, b.m128)); -# else +#else return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -# endif +#endif } ccl_device_inline float3 max(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_max_ps(a.m128, b.m128)); -# else +#else return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -# endif +#endif } ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx) @@ -303,43 +300,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 & ccl_device_inline float3 fabs(const float3 &a) { -# ifdef __KERNEL_SSE__ -# ifdef __KERNEL_NEON__ +#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ return float3(vabsq_f32(a.m128)); -# else +# else __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return float3(_mm_and_ps(a.m128, mask)); -# endif -# else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); # endif +#else + return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif } ccl_device_inline float3 sqrt(const 
float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_sqrt_ps(a)); -# else +#else return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z)); -# endif +#endif } ccl_device_inline float3 floor(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_floor_ps(a)); -# else +#else return make_float3(floorf(a.x), floorf(a.y), floorf(a.z)); -# endif +#endif } ccl_device_inline float3 ceil(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_ceil_ps(a)); -# else +#else return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z)); -# endif +#endif } ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) @@ -349,14 +346,13 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) ccl_device_inline float3 rcp(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -# else +#else return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); -# endif +#endif } -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float min3(float3 a) { @@ -483,11 +479,7 @@ ccl_device_inline float average(const float3 a) ccl_device_inline bool isequal_float3(const float3 a, const float3 b) { -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else return a == b; -#endif } ccl_device_inline float3 pow3(float3 v, float e) diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index 0ba2bafa2f0..19af5c8c638 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float4 operator-(const float4 &a); ccl_device_inline float4 operator*(const float4 &a, const float4 &b); ccl_device_inline float4 operator*(const float4 &a, float f); @@ -66,7 +65,6 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a); ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); -#endif /* !__KERNEL_OPENCL__*/ ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b); @@ -112,33 +110,32 @@ ccl_device_inline float4 one_float4() return make_float4(1.0f, 1.0f, 1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float4 operator-(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return float4(_mm_xor_ps(a.m128, mask)); -# else +#else return make_float4(-a.x, -a.y, -a.z, -a.w); -# endif +#endif } ccl_device_inline float4 operator*(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_mul_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -# endif +#endif } ccl_device_inline float4 operator*(const float4 &a, float f) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return a * make_float4(f); -# else +#else return make_float4(a.x * f, a.y * f, a.z * f, a.w * f); -# endif +#endif } ccl_device_inline float4 operator*(float f, const float4 &a) @@ -153,11 +150,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f) ccl_device_inline float4 operator/(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_div_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); -# endif +#endif } ccl_device_inline float4 operator+(const float4 &a, const float f) @@ -167,11 +164,11 
@@ ccl_device_inline float4 operator+(const float4 &a, const float f) ccl_device_inline float4 operator+(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_add_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -# endif +#endif } ccl_device_inline float4 operator-(const float4 &a, const float f) @@ -181,11 +178,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f) ccl_device_inline float4 operator-(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_sub_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -# endif +#endif } ccl_device_inline float4 operator+=(float4 &a, const float4 &b) @@ -215,38 +212,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f) ccl_device_inline int4 operator<(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -# endif +#endif } ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -# endif +#endif } ccl_device_inline int4 operator<=(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -# endif +#endif } ccl_device_inline bool operator==(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -# else +#else return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -# endif +#endif } 
ccl_device_inline float distance(const float4 &a, const float4 &b) @@ -256,16 +253,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b) ccl_device_inline float dot(const float4 &a, const float4 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) __m128 t = vmulq_f32(a, b); return vaddvq_f32(t); -# else - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -# endif # else - return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); # endif +#else + return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); +#endif } ccl_device_inline float len_squared(const float4 &a) @@ -275,21 +272,21 @@ ccl_device_inline float len_squared(const float4 &a) ccl_device_inline float4 rcp(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -# else +#else return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); -# endif +#endif } ccl_device_inline float4 sqrt(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_sqrt_ps(a.m128)); -# else +#else return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); -# endif +#endif } ccl_device_inline float4 sqr(const float4 &a) @@ -299,39 +296,39 @@ ccl_device_inline float4 sqr(const float4 &a) ccl_device_inline float4 cross(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); -# else +#else return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); -# endif +#endif } ccl_device_inline bool is_zero(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return a == make_float4(0.0f); -# else +#else return (a.x == 
0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -# endif +#endif } ccl_device_inline float4 reduce_add(const float4 &a) { -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) +# elif defined(__KERNEL_SSE3__) float4 h(_mm_hadd_ps(a.m128, a.m128)); return float4(_mm_hadd_ps(h.m128, h.m128)); -# else +# else float4 h(shuffle<1, 0, 3, 2>(a) + a); return shuffle<2, 3, 0, 1>(h) + h; -# endif -# else +# endif +#else float sum = (a.x + a.y) + (a.z + a.w); return make_float4(sum, sum, sum, sum); -# endif +#endif } ccl_device_inline float average(const float4 &a) @@ -357,20 +354,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a) ccl_device_inline float4 min(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_min_ps(a.m128, b.m128)); -# else +#else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -# endif +#endif } ccl_device_inline float4 max(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_max_ps(a.m128, b.m128)); -# else +#else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -# endif +#endif } ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx) @@ -380,24 +377,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a) { -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vabsq_f32(a)); -# else - return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); -# endif # else - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); # endif +#else + return 
make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +#endif } ccl_device_inline float4 floor(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_floor_ps(a)); -# else +#else return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); -# endif +#endif } ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) @@ -405,8 +402,6 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } -#endif /* !__KERNEL_OPENCL__*/ - #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h index 0295cd51f7e..5782b878801 100644 --- a/intern/cycles/util/util_math_int2.h +++ b/intern/cycles/util/util_math_int2.h @@ -27,20 +27,17 @@ CCL_NAMESPACE_BEGIN * Declaration. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline bool operator==(const int2 a, const int2 b); ccl_device_inline int2 operator+(const int2 &a, const int2 &b); ccl_device_inline int2 operator+=(int2 &a, const int2 &b); ccl_device_inline int2 operator-(const int2 &a, const int2 &b); ccl_device_inline int2 operator*(const int2 &a, const int2 &b); ccl_device_inline int2 operator/(const int2 &a, const int2 &b); -#endif /* !__KERNEL_OPENCL__ */ /******************************************************************************* * Definition. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline bool operator==(const int2 a, const int2 b) { return (a.x == b.x && a.y == b.y); @@ -70,7 +67,6 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) { return make_int2(a.x / b.x, a.y / b.y); } -#endif /* !__KERNEL_OPENCL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h index d92ed895dc2..e0dfae7c015 100644 --- a/intern/cycles/util/util_math_int3.h +++ b/intern/cycles/util/util_math_int3.h @@ -27,52 +27,49 @@ CCL_NAMESPACE_BEGIN * Declaration. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline int3 min(int3 a, int3 b); ccl_device_inline int3 max(int3 a, int3 b); ccl_device_inline int3 clamp(const int3 &a, int mn, int mx); ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx); -#endif /* !__KERNEL_OPENCL__ */ /******************************************************************************* * Definition. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline int3 min(int3 a, int3 b) { -# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_min_epi32(a.m128, b.m128)); -# else +#else return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -# endif +#endif } ccl_device_inline int3 max(int3 a, int3 b) { -# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_max_epi32(a.m128, b.m128)); -# else +#else return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -# endif +#endif } ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return min(max(a, make_int3(mn)), make_int3(mx)); -# else +#else return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -# endif +#endif } ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return min(max(a, mn), make_int3(mx)); -# 
else +#else return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -# endif +#endif } ccl_device_inline bool operator==(const int3 &a, const int3 &b) @@ -92,22 +89,21 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b) ccl_device_inline int3 operator+(const int3 &a, const int3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int3(_mm_add_epi32(a.m128, b.m128)); -# else +#else return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); -# endif +#endif } ccl_device_inline int3 operator-(const int3 &a, const int3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int3(_mm_sub_epi32(a.m128, b.m128)); -# else +#else return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); -# endif +#endif } -#endif /* !__KERNEL_OPENCL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 8905c8bc7f0..c78f4615013 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -66,6 +66,7 @@ typedef struct stat path_stat_t; static string cached_path = ""; static string cached_user_path = ""; +static string cached_temp_path = ""; static string cached_xdg_cache_path = ""; namespace { @@ -335,10 +336,11 @@ static string path_xdg_cache_get() } #endif -void path_init(const string &path, const string &user_path) +void path_init(const string &path, const string &user_path, const string &temp_path) { cached_path = path; cached_user_path = user_path; + cached_temp_path = temp_path; #ifdef _MSC_VER // workaround for https://svn.boost.org/trac/boost/ticket/6320 @@ -382,6 +384,15 @@ string path_cache_get(const string &sub) #endif } +string path_temp_get(const string &sub) +{ + if (cached_temp_path == "") { + cached_temp_path = Filesystem::temp_directory_path(); + } + + return path_join(cached_temp_path, sub); +} + #if defined(__linux__) || defined(__APPLE__) string path_xdg_home_get(const string &sub = ""); #endif @@ -739,177 +750,6 @@ bool path_remove(const string &path) 
return remove(path.c_str()) == 0; } -struct SourceReplaceState { - typedef map<string, string> ProcessedMapping; - /* Base director for all relative include headers. */ - string base; - /* Result of processed files. */ - ProcessedMapping processed_files; - /* Set of files which are considered "precompiled" and which are replaced - * with and empty string on a subsequent occurrence in include statement. - */ - set<string> precompiled_headers; -}; - -static string path_source_replace_includes_recursive(const string &source, - const string &source_filepath, - SourceReplaceState *state); - -static string line_directive(const SourceReplaceState &state, const string &path, const int line) -{ - string unescaped_path = path; - /* First we make path relative. */ - if (string_startswith(unescaped_path, state.base.c_str())) { - const string base_file = path_filename(state.base); - const size_t base_len = state.base.length(); - unescaped_path = base_file + - unescaped_path.substr(base_len, unescaped_path.length() - base_len); - } - /* Second, we replace all unsafe characters. */ - const size_t length = unescaped_path.length(); - string escaped_path = ""; - for (size_t i = 0; i < length; ++i) { - const char ch = unescaped_path[i]; - if (strchr("\"\'\?\\", ch) != NULL) { - escaped_path += "\\"; - } - escaped_path += ch; - } - /* TODO(sergey): Check whether using std::to_string combined with several - * concatenation operations is any faster. 
- */ - return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); -} - -static string path_source_handle_preprocessor(const string &preprocessor_line, - const string &source_filepath, - const size_t line_number, - SourceReplaceState *state) -{ - string result = preprocessor_line; - string token = string_strip(preprocessor_line.substr(1, preprocessor_line.size() - 1)); - if (string_startswith(token, "include")) { - token = string_strip(token.substr(7, token.size() - 7)); - if (token[0] == '"') { - const size_t n_start = 1; - const size_t n_end = token.find("\"", n_start); - const string filename = token.substr(n_start, n_end - n_start); - const bool is_precompiled = string_endswith(token, "// PRECOMPILED"); - string filepath = path_join(state->base, filename); - if (!path_exists(filepath)) { - filepath = path_join(path_dirname(source_filepath), filename); - } - if (is_precompiled) { - state->precompiled_headers.insert(filepath); - } - string text; - if (path_read_text(filepath, text)) { - text = path_source_replace_includes_recursive(text, filepath, state); - /* Use line directives for better error messages. */ - result = line_directive(*state, filepath, 1) + "\n" + text + "\n" + - line_directive(*state, source_filepath, line_number + 1); - } - } - } - return result; -} - -/* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of OpenCL drivers not supporting - * include paths with spaces in them. - */ -static string path_source_replace_includes_recursive(const string &source, - const string &source_filepath, - SourceReplaceState *state) -{ - /* Try to re-use processed file without spending time on replacing all - * include directives again. 
- */ - SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find( - source_filepath); - if (replaced_file != state->processed_files.end()) { - if (state->precompiled_headers.find(source_filepath) != state->precompiled_headers.end()) { - return ""; - } - return replaced_file->second; - } - /* Perform full file processing. */ - string result = ""; - const size_t source_length = source.length(); - size_t index = 0; - /* Information about where we are in the source. */ - size_t line_number = 0, column_number = 1; - /* Currently gathered non-preprocessor token. - * Store as start/length rather than token itself to avoid overhead of - * memory re-allocations on each character concatenation. - */ - size_t token_start = 0, token_length = 0; - /* Denotes whether we're inside of preprocessor line, together with - * preprocessor line itself. - * - * TODO(sergey): Investigate whether using token start/end position - * gives measurable speedup. - */ - bool inside_preprocessor = false; - string preprocessor_line = ""; - /* Actual loop over the whole source. */ - while (index < source_length) { - const char ch = source[index]; - if (ch == '\n') { - if (inside_preprocessor) { - result += path_source_handle_preprocessor( - preprocessor_line, source_filepath, line_number, state); - /* Start gathering net part of the token. */ - token_start = index; - token_length = 0; - } - inside_preprocessor = false; - preprocessor_line = ""; - column_number = 0; - ++line_number; - } - else if (ch == '#' && column_number == 1 && !inside_preprocessor) { - /* Append all possible non-preprocessor token to the result. 
*/ - if (token_length != 0) { - result.append(source, token_start, token_length); - token_start = index; - token_length = 0; - } - inside_preprocessor = true; - } - if (inside_preprocessor) { - preprocessor_line += ch; - } - else { - ++token_length; - } - ++index; - ++column_number; - } - /* Append possible tokens which happened before special events handled - * above. - */ - if (token_length != 0) { - result.append(source, token_start, token_length); - } - if (inside_preprocessor) { - result += path_source_handle_preprocessor( - preprocessor_line, source_filepath, line_number, state); - } - /* Store result for further reuse. */ - state->processed_files[source_filepath] = result; - return result; -} - -string path_source_replace_includes(const string &source, - const string &path, - const string &source_filename) -{ - SourceReplaceState state; - state.base = path; - return path_source_replace_includes_recursive(source, path_join(path, source_filename), &state); -} - FILE *path_fopen(const string &path, const string &mode) { #ifdef _WIN32 diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h index 7a83c2135a4..f899bc2e01c 100644 --- a/intern/cycles/util/util_path.h +++ b/intern/cycles/util/util_path.h @@ -32,9 +32,10 @@ CCL_NAMESPACE_BEGIN /* program paths */ -void path_init(const string &path = "", const string &user_path = ""); +void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = ""); string path_get(const string &sub = ""); string path_user_get(const string &sub = ""); +string path_temp_get(const string &sub = ""); string path_cache_get(const string &sub = ""); /* path string manipulation */ @@ -65,11 +66,6 @@ bool path_read_text(const string &path, string &text); /* File manipulation. 
*/ bool path_remove(const string &path); -/* source code utility */ -string path_source_replace_includes(const string &source, - const string &path, - const string &source_filename = ""); - /* cache utility */ void path_cache_clear_except(const string &name, const set<string> &except); diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp index 073b09f719f..5343f076e22 100644 --- a/intern/cycles/util/util_profiling.cpp +++ b/intern/cycles/util/util_profiling.cpp @@ -48,13 +48,7 @@ void Profiler::run() } if (cur_shader >= 0 && cur_shader < shader_samples.size()) { - /* Only consider the active shader during events whose runtime significantly depends on it. - */ - if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) || - ((cur_event >= PROFILING_CLOSURE_EVAL) && - (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) { - shader_samples[cur_shader]++; - } + shader_samples[cur_shader]++; } if (cur_object >= 0 && cur_object < object_samples.size()) { diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h index ceec08ed894..96bb682c50e 100644 --- a/intern/cycles/util/util_profiling.h +++ b/intern/cycles/util/util_profiling.h @@ -28,38 +28,30 @@ CCL_NAMESPACE_BEGIN enum ProfilingEvent : uint32_t { PROFILING_UNKNOWN, PROFILING_RAY_SETUP, - PROFILING_PATH_INTEGRATE, - PROFILING_SCENE_INTERSECT, - PROFILING_INDIRECT_EMISSION, - PROFILING_VOLUME, - PROFILING_SHADER_SETUP, - PROFILING_SHADER_EVAL, - PROFILING_SHADER_APPLY, - PROFILING_AO, - PROFILING_SUBSURFACE, - PROFILING_CONNECT_LIGHT, - PROFILING_SURFACE_BOUNCE, - PROFILING_WRITE_RESULT, - - PROFILING_INTERSECT, - PROFILING_INTERSECT_LOCAL, - PROFILING_INTERSECT_SHADOW_ALL, - PROFILING_INTERSECT_VOLUME, - PROFILING_INTERSECT_VOLUME_ALL, - - PROFILING_CLOSURE_EVAL, - PROFILING_CLOSURE_SAMPLE, - PROFILING_CLOSURE_VOLUME_EVAL, - PROFILING_CLOSURE_VOLUME_SAMPLE, - - PROFILING_DENOISING, - PROFILING_DENOISING_CONSTRUCT_TRANSFORM, - 
PROFILING_DENOISING_RECONSTRUCT, - PROFILING_DENOISING_DIVIDE_SHADOW, - PROFILING_DENOISING_NON_LOCAL_MEANS, - PROFILING_DENOISING_COMBINE_HALVES, - PROFILING_DENOISING_GET_FEATURE, - PROFILING_DENOISING_DETECT_OUTLIERS, + + PROFILING_INTERSECT_CLOSEST, + PROFILING_INTERSECT_SUBSURFACE, + PROFILING_INTERSECT_SHADOW, + PROFILING_INTERSECT_VOLUME_STACK, + + PROFILING_SHADE_SURFACE_SETUP, + PROFILING_SHADE_SURFACE_EVAL, + PROFILING_SHADE_SURFACE_DIRECT_LIGHT, + PROFILING_SHADE_SURFACE_INDIRECT_LIGHT, + PROFILING_SHADE_SURFACE_AO, + PROFILING_SHADE_SURFACE_PASSES, + + PROFILING_SHADE_VOLUME_SETUP, + PROFILING_SHADE_VOLUME_INTEGRATE, + PROFILING_SHADE_VOLUME_DIRECT_LIGHT, + PROFILING_SHADE_VOLUME_INDIRECT_LIGHT, + + PROFILING_SHADE_SHADOW_SETUP, + PROFILING_SHADE_SHADOW_SURFACE, + PROFILING_SHADE_SHADOW_VOLUME, + + PROFILING_SHADE_LIGHT_SETUP, + PROFILING_SHADE_LIGHT_EVAL, PROFILING_NUM_EVENTS, }; @@ -136,37 +128,51 @@ class ProfilingHelper { state->event = event; } + ~ProfilingHelper() + { + state->event = previous_event; + } + inline void set_event(ProfilingEvent event) { state->event = event; } - inline void set_shader(int shader) + protected: + ProfilingState *state; + uint32_t previous_event; +}; + +class ProfilingWithShaderHelper : public ProfilingHelper { + public: + ProfilingWithShaderHelper(ProfilingState *state, ProfilingEvent event) + : ProfilingHelper(state, event) { - state->shader = shader; - if (state->active) { - assert(shader < state->shader_hits.size()); - state->shader_hits[shader]++; - } } - inline void set_object(int object) + ~ProfilingWithShaderHelper() { - state->object = object; - if (state->active) { - assert(object < state->object_hits.size()); - state->object_hits[object]++; - } + state->object = -1; + state->shader = -1; } - ~ProfilingHelper() + inline void set_shader(int object, int shader) { - state->event = previous_event; + if (state->active) { + state->shader = shader; + state->object = object; + + if (shader >= 0) { + assert(shader < 
state->shader_hits.size()); + state->shader_hits[shader]++; + } + + if (object >= 0) { + assert(object < state->object_hits.size()); + state->object_hits[object]++; + } + } } - - private: - ProfilingState *state; - uint32_t previous_event; }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 26534a29dfe..dca8d3d0ab5 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -46,7 +46,6 @@ class Progress { substatus = ""; sync_status = ""; sync_substatus = ""; - kernel_status = ""; update_cb = function_null; cancel = false; cancel_message = ""; @@ -87,7 +86,6 @@ class Progress { substatus = ""; sync_status = ""; sync_substatus = ""; - kernel_status = ""; cancel = false; cancel_message = ""; error = false; @@ -316,24 +314,6 @@ class Progress { } } - /* kernel status */ - - void set_kernel_status(const string &kernel_status_) - { - { - thread_scoped_lock lock(progress_mutex); - kernel_status = kernel_status_; - } - - set_update(); - } - - void get_kernel_status(string &kernel_status_) - { - thread_scoped_lock lock(progress_mutex); - kernel_status_ = kernel_status; - } - /* callback */ void set_update() @@ -378,8 +358,6 @@ class Progress { string sync_status; string sync_substatus; - string kernel_status; - volatile bool cancel; string cancel_message; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 8e8caa98a1b..b4a153c329f 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -61,14 +61,14 @@ static struct TrueTy { { return true; } -} True ccl_maybe_unused; +} True ccl_attr_maybe_unused; static struct FalseTy { __forceinline operator bool() const { return false; } -} False ccl_maybe_unused; +} False ccl_attr_maybe_unused; static struct ZeroTy { __forceinline operator float() const @@ -79,7 +79,7 @@ static struct ZeroTy { { return 0; } -} zero ccl_maybe_unused; +} zero ccl_attr_maybe_unused; static struct OneTy { 
__forceinline operator float() const @@ -90,7 +90,7 @@ static struct OneTy { { return 1; } -} one ccl_maybe_unused; +} one ccl_attr_maybe_unused; static struct NegInfTy { __forceinline operator float() const @@ -101,7 +101,7 @@ static struct NegInfTy { { return std::numeric_limits<int>::min(); } -} neg_inf ccl_maybe_unused; +} neg_inf ccl_attr_maybe_unused; static struct PosInfTy { __forceinline operator float() const @@ -112,10 +112,10 @@ static struct PosInfTy { { return std::numeric_limits<int>::max(); } -} inf ccl_maybe_unused, pos_inf ccl_maybe_unused; +} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused; static struct StepTy { -} step ccl_maybe_unused; +} step ccl_attr_maybe_unused; #endif diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index d809f2e06d7..7df52d462b7 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -24,9 +24,9 @@ CCL_NAMESPACE_BEGIN -#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC) +#if defined(CYCLES_CUBIN_CC) # define static_assert(statement, message) -#endif /* __KERNEL_OPENCL__ */ +#endif #define static_assert_align(st, align) \ static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index 4dfebf14923..9c0b2ca50bb 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -17,6 +17,9 @@ #include <stdarg.h> #include <stdio.h> +#include <algorithm> +#include <cctype> + #include "util/util_foreach.h" #include "util/util_string.h" #include "util/util_windows.h" @@ -107,24 +110,26 @@ void string_split(vector<string> &tokens, } } -bool string_startswith(const string &s, const char *start) +bool string_startswith(const string_view s, const string_view start) { - size_t len = strlen(start); + const size_t len = start.size(); - if (len > s.size()) - return 0; - else - return 
strncmp(s.c_str(), start, len) == 0; + if (len > s.size()) { + return false; + } + + return strncmp(s.c_str(), start.data(), len) == 0; } -bool string_endswith(const string &s, const string &end) +bool string_endswith(const string_view s, const string_view end) { - size_t len = end.length(); + const size_t len = end.size(); - if (len > s.size()) - return 0; - else - return s.compare(s.length() - len, len, end) == 0; + if (len > s.size()) { + return false; + } + + return strncmp(s.c_str() + s.size() - len, end.data(), len) == 0; } string string_strip(const string &s) @@ -172,6 +177,13 @@ string to_string(const char *str) return string(str); } +string string_to_lower(const string &s) +{ + string r = s; + std::transform(r.begin(), r.end(), r.begin(), [](char c) { return std::tolower(c); }); + return r; +} + /* Wide char strings helpers for Windows. */ #ifdef _WIN32 diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h index f2272819b2f..55462cfd8b8 100644 --- a/intern/cycles/util/util_string.h +++ b/intern/cycles/util/util_string.h @@ -21,6 +21,11 @@ #include <string.h> #include <string> +/* Use string view implementation from OIIO. + * Ideally, need to switch to `std::string_view`, but this first requires getting rid of using + * namespace OIIO as it causes symbol collision. 
*/ +#include <OpenImageIO/string_view.h> + #include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -31,6 +36,8 @@ using std::string; using std::stringstream; using std::to_string; +using OIIO::string_view; + #ifdef __GNUC__ # define PRINTF_ATTRIBUTE __attribute__((format(printf, 1, 2))) #else @@ -45,12 +52,13 @@ void string_split(vector<string> &tokens, const string &separators = "\t ", bool skip_empty_tokens = true); void string_replace(string &haystack, const string &needle, const string &other); -bool string_startswith(const string &s, const char *start); -bool string_endswith(const string &s, const string &end); +bool string_startswith(string_view s, string_view start); +bool string_endswith(string_view s, string_view end); string string_strip(const string &s); string string_remove_trademark(const string &s); string string_from_bool(const bool var); string to_string(const char *str); +string string_to_lower(const string &s); /* Wide char strings are only used on Windows to deal with non-ASCII * characters in file names and such. No reason to use such strings diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index b010881058b..be8c2fb505a 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -403,4 +403,13 @@ size_t system_physical_ram() #endif } +uint64_t system_self_process_id() +{ +#ifdef _WIN32 + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index c4db8b74339..a1797e6ca44 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -65,6 +65,9 @@ size_t system_physical_ram(); /* Start a new process of the current application with the given arguments. */ bool system_call_self(const vector<string> &args); +/* Get identifier of the currently running process. 
*/ +uint64_t system_self_process_id(); + CCL_NAMESPACE_END #endif /* __UTIL_SYSTEM_H__ */ diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h index 73e0f92d19c..8f84377ac8c 100644 --- a/intern/cycles/util/util_tbb.h +++ b/intern/cycles/util/util_tbb.h @@ -23,6 +23,7 @@ #include <tbb/enumerable_thread_specific.h> #include <tbb/parallel_for.h> +#include <tbb/parallel_for_each.h> #include <tbb/task_arena.h> #include <tbb/task_group.h> diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index 71bf9c65911..4de66bf5f46 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -85,8 +85,6 @@ typedef struct TextureInfo { uint64_t data; /* Data Type */ uint data_type; - /* Buffer number for OpenCL. */ - uint cl_buffer; /* Interpolation and extension type. */ uint interpolation, extension; /* Dimensions. */ diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index f79eac4cbcf..e9cd3b0b483 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -498,36 +498,12 @@ Transform transform_from_viewplane(BoundBox2D &viewplane); #endif -/* TODO(sergey): This is only for until we've got OpenCL 2.0 - * on all devices we consider supported. It'll be replaced with - * generic address space. - */ +/* TODO: This can be removed when we know if no devices will require explicit + * address space qualifiers for this case. 
*/ -#ifdef __KERNEL_OPENCL__ - -# define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a##b -# define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \ - ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \ - ccl_addr_space const Transform *t, const float3 a) \ - { \ - Transform private_tfm = *t; \ - return function(&private_tfm, a); \ - } - -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point) -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction) -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed) - -# undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE -# undef OPENCL_TRANSFORM_ADDRSPACE_GLUE -# define transform_point_auto transform_point_addrspace -# define transform_direction_auto transform_direction_addrspace -# define transform_direction_transposed_auto transform_direction_transposed_addrspace -#else -# define transform_point_auto transform_point -# define transform_direction_auto transform_direction -# define transform_direction_transposed_auto transform_direction_transposed -#endif +#define transform_point_auto transform_point +#define transform_direction_auto transform_direction +#define transform_direction_transposed_auto transform_direction_transposed CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 87358877e3c..442c32b3a3d 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -17,9 +17,7 @@ #ifndef __UTIL_TYPES_H__ #define __UTIL_TYPES_H__ -#ifndef __KERNEL_OPENCL__ -# include <stdlib.h> -#endif +#include <stdlib.h> /* Standard Integer Types */ @@ -44,18 +42,12 @@ CCL_NAMESPACE_BEGIN /* Shorter Unsigned Names */ -#ifndef __KERNEL_OPENCL__ typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned short ushort; -#endif /* Fixed Bits Types */ -#ifdef __KERNEL_OPENCL__ -typedef unsigned long uint64_t; -#endif - #ifndef __KERNEL_GPU__ /* Generic Memory Pointer */ diff --git a/intern/cycles/util/util_unique_ptr.h 
b/intern/cycles/util/util_unique_ptr.h index 3aaaf083eff..3181eafd43d 100644 --- a/intern/cycles/util/util_unique_ptr.h +++ b/intern/cycles/util/util_unique_ptr.h @@ -21,6 +21,7 @@ CCL_NAMESPACE_BEGIN +using std::make_unique; using std::unique_ptr; CCL_NAMESPACE_END |