137 files changed, 4112 insertions, 11813 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 121c8bdad6e..e5a5e9773d3 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -286,6 +286,7 @@ include_directories(
   ${OPENEXR_INCLUDE_DIR}
   ${OPENEXR_INCLUDE_DIRS}
   ${PUGIXML_INCLUDE_DIR}
+  ${TBB_INCLUDE_DIRS}
 )
 
 if(CYCLES_STANDALONE_REPOSITORY)
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 496e8e9310b..2316800e21e 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -92,10 +92,6 @@ if(WITH_MOD_FLUID)
   add_definitions(-DWITH_FLUID)
 endif()
 
-if(WITH_NEW_OBJECT_TYPES)
-  add_definitions(-DWITH_NEW_OBJECT_TYPES)
-endif()
-
 if(WITH_OPENVDB)
   add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS})
   list(APPEND INC_SYS
@@ -106,6 +102,13 @@ if(WITH_OPENVDB)
   )
 endif()
 
+if(WITH_OPENIMAGEDENOISE)  # define WITH_OPENIMAGEDENOISE and add the OIDN headers to INC_SYS
+  add_definitions(-DWITH_OPENIMAGEDENOISE)
+  list(APPEND INC_SYS
+    ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+  )
+endif()
+
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
 # avoid link failure with clang 3.4 debug
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e7ea5e7a1f6..7566ca28dd7 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -179,7 +179,8 @@ def reset(engine, data, depsgraph):
     import _cycles
     import bpy
 
-    if bpy.app.debug_value == 256:
+    prefs = bpy.context.preferences
+    if prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui:
         _cycles.debug_flags_update(depsgraph.scene.as_pointer())
     else:
         _cycles.debug_flags_reset()
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 1635afab210..840efb65d96 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -55,8 +55,7 @@ enum_displacement_methods = (
 
 enum_bvh_layouts = (
     ('BVH2', "BVH2", "", 1),
-    ('BVH4', "BVH4", "", 2),
-    ('BVH8', "BVH8", "", 4),
+    ('EMBREE', "Embree", "", 4),
 )
 
 enum_bvh_types = (
@@ -78,20 +77,9 @@ enum_panorama_types = (
     ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
 )
 
-enum_curve_primitives = (
-    ('TRIANGLES', "Triangles", "Create triangle geometry around strands"),
-    ('LINE_SEGMENTS', "Line Segments", "Use line segment primitives"),
-    ('CURVE_SEGMENTS', "Curve Segments", "Use segmented cardinal curve primitives"),
-)
-
-enum_triangle_curves = (
-    ('CAMERA_TRIANGLES', "Planes", "Create individual triangles forming planes that face camera"),
-    ('TESSELLATED_TRIANGLES', "Tessellated", "Create mesh surrounding each strand"),
-)
-
 enum_curve_shape = (
-    ('RIBBONS', "Ribbons", "Ignore thickness of each strand"),
-    ('THICK', "Thick", "Use thickness of strand when rendering"),
+    ('RIBBONS', "Rounded Ribbons", "Render hair as flat ribbon with rounded normals, for fast rendering"),
+    ('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"),
 )
 
 enum_tile_order = (
@@ -194,10 +182,36 @@ enum_aov_types = (
     ('COLOR', "Color", "Write a Color pass", 1),
 )
 
-enum_viewport_denoising = (
-    ('NONE', "None", "Disable viewport denoising", 0),
-    ('OPTIX', "OptiX AI-Accelerated", "Use the OptiX denoiser running on the GPU (requires at least one compatible OptiX device)", 1),
-)
+def enum_openimagedenoise_denoiser(self, context):
+    if not _cycles.with_openimagedenoise:  # no item unless _cycles reports OpenImageDenoise
+        return []
+    return [('OPENIMAGEDENOISE', "OpenImageDenoise", "Use Intel OpenImageDenoise AI denoiser running on the CPU", 4)]
+
+def enum_optix_denoiser(self, context):
+    if context and not context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'):
+        return []  # a context was given and it lists no OPTIX devices
+    return [('OPTIX', "OptiX", "Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs", 2)]
+
+def enum_preview_denoiser(self, context):
+    """Viewport denoiser items: an 'AUTO' entry followed by every available denoiser item."""
+    optix_items = enum_optix_denoiser(self, context)
+    oidn_items = enum_openimagedenoise_denoiser(self, context)
+
+    # Label 'AUTO' after the first available backend, checking OptiX before OpenImageDenoise.
+    if optix_items:
+        auto_label = "Fastest (OptiX)"
+    elif oidn_items:
+        auto_label = "Fastest (OpenImageDenoise)"
+    else:
+        auto_label = "None"
+
+    auto_item = ('AUTO', auto_label, "Use the fastest available denoiser for viewport rendering", 0)
+    return [auto_item] + optix_items + oidn_items
+
+def enum_denoiser(self, context):
+    # NLM is always offered; whatever enum_optix_denoiser() returns is appended after it.
+    nlm_item = ('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)
+    return [nlm_item] + enum_optix_denoiser(self, context)
 
 enum_denoising_optix_input_passes = (
     ('RGB', "Color", "Use only color as input", 1),
@@ -236,11 +250,29 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         description="Pause all viewport preview renders",
         default=False,
     )
-    preview_denoising: EnumProperty(
-        name="Viewport Denoising",
-        description="Denoise the image after each preview update with the selected denoiser engine",
-        items=enum_viewport_denoising,
-        default='NONE',
+
+    use_denoising: BoolProperty(
+        name="Use Denoising",
+        description="Denoise the rendered image",
+        default=False,
+    )
+    use_preview_denoising: BoolProperty(
+        name="Use Viewport Denoising",
+        description="Denoise the image in the 3D viewport",
+        default=False,
+    )
+
+    denoiser: EnumProperty(
+        name="Denoiser",
+        description="Denoise the image with the selected denoiser",
+        items=enum_denoiser,
+        default=1,
+    )
+    preview_denoiser: EnumProperty(
+        name="Viewport Denoiser",
+        description="Denoise the image after each preview update with the selected denoiser",
+        items=enum_preview_denoiser,
+        default=0,
     )
 
     use_square_samples: BoolProperty(
@@ -256,7 +288,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=128,
     )
     preview_samples: IntProperty(
-        name="Preview Samples",
+        name="Viewport Samples",
         description="Number of samples to render in the viewport, unlimited if 0",
         min=0, max=(1 << 24),
         default=32,
@@ -476,7 +508,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         subtype='PIXEL'
     )
     preview_dicing_rate: FloatProperty(
-        name="Preview Dicing Rate",
+        name="Viewport Dicing Rate",
         description="Size of a micropolygon in pixels during preview render",
         min=0.1, max=1000.0, soft_min=0.5,
         default=8.0,
@@ -629,11 +661,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         items=enum_bvh_types,
         default='DYNAMIC_BVH',
     )
-    use_bvh_embree: BoolProperty(
-        name="Use Embree",
-        description="Use Embree as ray accelerator",
-        default=False,
-    )
     debug_use_spatial_splits: BoolProperty(
         name="Use Spatial Splits",
         description="Use BVH spatial splits: longer builder time, faster render",
@@ -786,7 +813,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     debug_bvh_layout: EnumProperty(
         name="BVH Layout",
         items=enum_bvh_layouts,
-        default='BVH8',
+        default='EMBREE',
     )
     debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
 
@@ -1241,39 +1268,17 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
 
 class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
 
-    primitive: EnumProperty(
-        name="Primitive",
-        description="Type of primitive used for hair rendering",
-        items=enum_curve_primitives,
-        default='LINE_SEGMENTS',
-    )
     shape: EnumProperty(
         name="Shape",
         description="Form of hair",
         items=enum_curve_shape,
-        default='THICK',
-    )
-    cull_backfacing: BoolProperty(
-        name="Cull Back-faces",
-        description="Do not test the back-face of each strand",
-        default=True,
-    )
-    use_curves: BoolProperty(
-        name="Use Cycles Hair Rendering",
-        description="Activate Cycles hair rendering for particle system",
-        default=True,
-    )
-    resolution: IntProperty(
-        name="Resolution",
-        description="Resolution of generated mesh",
-        min=3, max=64,
-        default=3,
+        default='RIBBONS',
     )
     subdivisions: IntProperty(
         name="Subdivisions",
         description="Number of subdivisions used in Cardinal curve intersection (power of 2)",
         min=0, max=24,
-        default=4,
+        default=2,
     )
 
     @classmethod
@@ -1369,7 +1374,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
     use_denoising: BoolProperty(
         name="Use Denoising",
         description="Denoise the rendered image",
-        default=False,
+        default=True,
         update=update_render_passes,
     )
     denoising_diffuse_direct: BoolProperty(
@@ -1439,12 +1444,6 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         default=0,
     )
 
-    use_optix_denoising: BoolProperty(
-        name="OptiX AI-Accelerated",
-        description="Use the OptiX denoiser to denoise the rendered image",
-        default=False,
-        update=update_render_passes,
-    )
     denoising_optix_input_passes: EnumProperty(
         name="Input Passes",
         description="Passes handed over to the OptiX denoiser (this can have different effects on the denoised image)",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 78a44881743..b049d0bf2b4 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -112,10 +112,6 @@ def show_device_active(context):
         return True
     return context.preferences.addons[__package__].preferences.has_active_device()
 
-def show_optix_denoising(context):
-    # OptiX AI denoiser can be used when at least one device supports OptiX
-    return bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'))
-
 
 def draw_samples_info(layout, context):
     cscene = context.scene.cycles
@@ -190,11 +186,6 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
             col.prop(cscene, "aa_samples", text="Render")
             col.prop(cscene, "preview_aa_samples", text="Viewport")
 
-        # Viewport denoising is currently only supported with OptiX
-        if show_optix_denoising(context):
-            col = layout.column()
-            col.prop(cscene, "preview_denoising")
-
         if not use_branched_path(context):
             draw_samples_info(layout, context)
 
@@ -256,6 +247,39 @@ class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
         col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
         col.prop(cscene, "adaptive_min_samples", text="Min Samples")
 
+
+class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
+    # Sub-panel of CYCLES_RENDER_PT_sampling with the render and viewport denoiser settings.
+    bl_label = "Denoising"
+    bl_parent_id = "CYCLES_RENDER_PT_sampling"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw(self, context):
+        cscene = context.scene.cycles
+        layout = self.layout
+        layout.use_property_split = True
+        layout.use_property_decorate = False
+
+        # Render: enable checkbox, then the denoiser choice (inactive while unchecked).
+        render_col = layout.column(align=True, heading="Render")
+        render_row = render_col.row(align=True)
+        render_row.prop(cscene, "use_denoising", text="")
+        render_choice = render_row.row()
+        render_choice.active = cscene.use_denoising
+        render_choice.prop(cscene, "denoiser", text="")
+
+        # Viewport: same checkbox/choice pair, followed by the start sample on its own row.
+        viewport_col = layout.column(align=False, heading="Viewport")
+        viewport_row = viewport_col.row(align=True)
+        viewport_row.prop(cscene, "use_preview_denoising", text="")
+        viewport_choice = viewport_row.row()
+        viewport_choice.active = cscene.use_preview_denoising
+        viewport_choice.prop(cscene, "preview_denoiser", text="")
+        start_row = viewport_col.row(align=True)
+        start_row.active = cscene.use_preview_denoising
+        start_row.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
+
+
 class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
     bl_label = "Advanced"
     bl_parent_id = "CYCLES_RENDER_PT_sampling"
@@ -387,13 +411,6 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel):
     bl_label = "Hair"
     bl_options = {'DEFAULT_CLOSED'}
 
-    def draw_header(self, context):
-        layout = self.layout
-        scene = context.scene
-        ccscene = scene.cycles_curves
-
-        layout.prop(ccscene, "use_curves", text="")
-
     def draw(self, context):
         layout = self.layout
         layout.use_property_split = True
@@ -402,18 +419,10 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel):
         scene = context.scene
         ccscene = scene.cycles_curves
 
-        layout.active = ccscene.use_curves
-
         col = layout.column()
         col.prop(ccscene, "shape", text="Shape")
-        if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
-            col.prop(ccscene, "cull_backfacing", text="Cull back-faces")
-        col.prop(ccscene, "primitive", text="Primitive")
-
-        if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
-            col.prop(ccscene, "resolution", text="Resolution")
-        elif ccscene.primitive == 'CURVE_SEGMENTS':
-            col.prop(ccscene, "subdivisions", text="Curve subdivisions")
+        if ccscene.shape == 'RIBBONS':
+            col.prop(ccscene, "subdivisions", text="Curve Subdivisions")
 
 
 class CYCLES_RENDER_PT_volumes(CyclesButtonsPanel, Panel):
@@ -693,16 +702,20 @@ class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Pa
 
         col = layout.column()
 
-        if _cycles.with_embree:
-            row = col.row()
-            row.active = use_cpu(context)
-            row.prop(cscene, "use_bvh_embree")
+        use_embree = False  # only taken from _cycles.with_embree when use_cpu(context) is true
+        if use_cpu(context):
+            use_embree = _cycles.with_embree
+            if not use_embree:
+                sub = col.column(align=True)
+                sub.label(text="Cycles built without Embree support")
+                sub.label(text="CPU raytracing performance will be poor")
+
         col.prop(cscene, "debug_use_spatial_splits")
         sub = col.column()
-        sub.active = not cscene.use_bvh_embree or not _cycles.with_embree
+        sub.active = not use_embree
         sub.prop(cscene, "debug_use_hair_bvh")
         sub = col.column()
-        sub.active = not cscene.debug_use_spatial_splits and not cscene.use_bvh_embree
+        sub.active = not cscene.debug_use_spatial_splits and not use_embree
         sub.prop(cscene, "debug_bvh_time_steps")
 
 
@@ -741,11 +754,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
         col.prop(rd, "preview_pixel_size", text="Pixel Size")
         col.prop(cscene, "preview_start_resolution", text="Start Pixels")
 
-        if show_optix_denoising(context):
-            sub = col.row(align=True)
-            sub.active = cscene.preview_denoising != 'NONE'
-            sub.prop(cscene, "preview_denoising_start_sample", text="Denoising Start Sample")
-
 
 class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
     bl_label = "Filter"
@@ -968,12 +976,17 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
     bl_context = "view_layer"
     bl_options = {'DEFAULT_CLOSED'}
 
+    @classmethod
+    def poll(cls, context):
+        scene_settings = context.scene.cycles  # scene-level Cycles settings
+        return CyclesButtonsPanel.poll(context) and scene_settings.use_denoising
+
     def draw_header(self, context):
         scene = context.scene
         view_layer = context.view_layer
         cycles_view_layer = view_layer.cycles
-        layout = self.layout
 
+        layout = self.layout
         layout.prop(cycles_view_layer, "use_denoising", text="")
 
     def draw(self, context):
@@ -984,18 +997,17 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
         scene = context.scene
         view_layer = context.view_layer
         cycles_view_layer = view_layer.cycles
+        denoiser = scene.cycles.denoiser
 
-        layout.active = cycles_view_layer.use_denoising
+        layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising
 
         col = layout.column()
 
-        if show_optix_denoising(context):
-            col.prop(cycles_view_layer, "use_optix_denoising")
-            col.separator(factor=2.0)
-
-            if cycles_view_layer.use_optix_denoising:
-                col.prop(cycles_view_layer, "denoising_optix_input_passes")
-                return
+        if denoiser == 'OPTIX':
+            col.prop(cycles_view_layer, "denoising_optix_input_passes")
+            return
+        elif denoiser == 'OPENIMAGEDENOISE':
+            return
 
         col.prop(cycles_view_layer, "denoising_radius", text="Radius")
 
@@ -1190,6 +1202,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
 
     def draw(self, context):
         layout = self.layout
+        layout.use_property_split = True
 
         rd = context.scene.render
         # scene = context.scene
@@ -1199,10 +1212,10 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
 
         layout.active = (rd.use_motion_blur and cob.use_motion_blur)
 
-        row = layout.row()
+        col = layout.column()
+        col.prop(cob, "motion_steps", text="Steps")
         if ob.type != 'CAMERA':
-            row.prop(cob, "use_deform_motion", text="Deformation")
-        row.prop(cob, "motion_steps", text="Steps")
+            col.prop(cob, "use_deform_motion", text="Deformation")
 
 
 def has_geometry_visibility(ob):
@@ -1575,17 +1588,18 @@ class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel):
 
     def draw(self, context):
         layout = self.layout
+        layout.use_property_split = True
+        layout.use_property_decorate = False
 
         world = context.world
         visibility = world.cycles_visibility
 
-        flow = layout.column_flow()
-
-        flow.prop(visibility, "camera")
-        flow.prop(visibility, "diffuse")
-        flow.prop(visibility, "glossy")
-        flow.prop(visibility, "transmission")
-        flow.prop(visibility, "scatter")
+        col = layout.column()
+        col.prop(visibility, "camera")
+        col.prop(visibility, "diffuse")
+        col.prop(visibility, "glossy")
+        col.prop(visibility, "transmission")
+        col.prop(visibility, "scatter")
 
 
 class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel):
@@ -1975,7 +1989,10 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        return CyclesButtonsPanel.poll(context) and bpy.app.debug_value == 256
+        # Shown only when both the experimental Cycles debug option and the developer UI are on.
+        prefs = context.preferences
+        return (CyclesButtonsPanel.poll(context) and
+                prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui)
 
     def draw(self, context):
         layout = self.layout
@@ -2248,6 +2265,7 @@ classes = (
     CYCLES_RENDER_PT_sampling,
     CYCLES_RENDER_PT_sampling_sub_samples,
     CYCLES_RENDER_PT_sampling_adaptive,
+    CYCLES_RENDER_PT_sampling_denoising,
     CYCLES_RENDER_PT_sampling_advanced,
     CYCLES_RENDER_PT_light_paths,
     CYCLES_RENDER_PT_light_paths_max_bounces,
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 40a1a2c2edc..011678a7a65 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -867,13 +867,13 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d,
   }
 }
 
-BufferParams BlenderSync::get_buffer_params(BL::Scene &b_scene,
-                                            BL::RenderSettings &b_render,
+BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render,
                                             BL::SpaceView3D &b_v3d,
                                             BL::RegionView3D &b_rv3d,
                                             Camera *cam,
                                             int width,
-                                            int height)
+                                            int height,
+                                            const bool use_denoiser)
 {
   BufferParams params;
   bool use_border = false;
@@ -907,8 +907,7 @@ BufferParams BlenderSync::get_buffer_params(BL::Scene &b_scene,
   PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
 
   /* Can only denoise the combined image pass */
-  params.denoising_data_pass = display_pass == PASS_COMBINED &&
-                               update_viewport_display_denoising(b_v3d, b_scene);
+  params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser;
 
   return params;
 }
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 847a43c5f34..82c99631a89 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -18,7 +18,6 @@
 #include "render/camera.h"
 #include "render/curves.h"
 #include "render/hair.h"
-#include "render/mesh.h"
 #include "render/object.h"
 #include "render/scene.h"
 
@@ -39,27 +38,6 @@ ParticleCurveData::~ParticleCurveData()
 {
 }
 
-static void interp_weights(float t, float data[4])
-{
-  /* Cardinal curve interpolation */
-  float t2 = t * t;
-  float t3 = t2 * t;
-  float fc = 0.71f;
-
-  data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
-  data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
-  data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
-  data[3] = fc * t3 - fc * t2;
-}
-
-static void curveinterp_v3_v3v3v3v3(
-    float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4])
-{
-  p->x = v1->x * w[0] + v2->x * w[1] + v3->x * w[2] + v4->x * w[3];
-  p->y = v1->y * w[0] + v2->y * w[1] + v3->y * w[2] + v4->y * w[3];
-  p->z = v1->z * w[0] + v2->z * w[1] + v3->z * w[2] + v4->z * w[3];
-}
-
 static float shaperadius(float shape, float root, float tip, float time)
 {
   assert(time >= 0.0f);
@@ -77,43 +55,13 @@ static float shaperadius(float shape, float root, float tip, float time)
 
 /* curve functions */
 
-static void InterpolateKeySegments(
-    int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData)
-{
-  float3 ckey_loc1 = CData->curvekey_co[key];
-  float3 ckey_loc2 = ckey_loc1;
-  float3 ckey_loc3 = CData->curvekey_co[key + 1];
-  float3 ckey_loc4 = ckey_loc3;
-
-  if (key > CData->curve_firstkey[curve])
-    ckey_loc1 = CData->curvekey_co[key - 1];
-
-  if (key < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2)
-    ckey_loc4 = CData->curvekey_co[key + 2];
-
-  float time1 = CData->curvekey_time[key] / CData->curve_length[curve];
-  float time2 = CData->curvekey_time[key + 1] / CData->curve_length[curve];
-
-  float dfra = (time2 - time1) / (float)segno;
-
-  if (time)
-    *time = (dfra * seg) + time1;
-
-  float t[4];
-
-  interp_weights((float)seg / (float)segno, t);
-
-  if (keyloc)
-    curveinterp_v3_v3v3v3v3(keyloc, &ckey_loc1, &ckey_loc2, &ckey_loc3, &ckey_loc4, t);
-}
-
 static bool ObtainCacheParticleData(
-    Geometry *geom, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
+    Hair *hair, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
 {
   int curvenum = 0;
   int keyno = 0;
 
-  if (!(geom && b_mesh && b_ob && CData))
+  if (!(hair && b_mesh && b_ob && CData))
     return false;
 
   Transform tfm = get_transform(b_ob->matrix_world());
@@ -129,7 +77,7 @@ static bool ObtainCacheParticleData(
 
       if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) &&
           (b_part.type() == BL::ParticleSettings::type_HAIR)) {
-        int shader = clamp(b_part.material() - 1, 0, geom->used_shaders.size() - 1);
+        int shader = clamp(b_part.material() - 1, 0, hair->used_shaders.size() - 1);
         int display_step = background ? b_part.render_step() : b_part.display_step();
         int totparts = b_psys.particles.length();
         int totchild = background ? b_psys.child_particles.length() :
@@ -203,14 +151,14 @@ static bool ObtainCacheParticleData(
   return true;
 }
 
-static bool ObtainCacheParticleUV(Geometry *geom,
+static bool ObtainCacheParticleUV(Hair *hair,
                                   BL::Mesh *b_mesh,
                                   BL::Object *b_ob,
                                   ParticleCurveData *CData,
                                   bool background,
                                   int uv_num)
 {
-  if (!(geom && b_mesh && b_ob && CData))
+  if (!(hair && b_mesh && b_ob && CData))
     return false;
 
   CData->curve_uv.clear();
@@ -266,14 +214,14 @@ static bool ObtainCacheParticleUV(Geometry *geom,
   return true;
 }
 
-static bool ObtainCacheParticleVcol(Geometry *geom,
+static bool ObtainCacheParticleVcol(Hair *hair,
                                     BL::Mesh *b_mesh,
                                     BL::Object *b_ob,
                                     ParticleCurveData *CData,
                                     bool background,
                                     int vcol_num)
 {
-  if (!(geom && b_mesh && b_ob && CData))
+  if (!(hair && b_mesh && b_ob && CData))
     return false;
 
   CData->curve_vcol.clear();
@@ -314,7 +262,7 @@ static bool ObtainCacheParticleVcol(Geometry *geom,
           BL::Mesh::vertex_colors_iterator l;
           b_mesh->vertex_colors.begin(l);
 
-          float3 vcol = make_float3(0.0f, 0.0f, 0.0f);
+          float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
           if (b_mesh->vertex_colors.length())
             b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
           CData->curve_vcol.push_back_slow(vcol);
@@ -329,272 +277,6 @@ static bool ObtainCacheParticleVcol(Geometry *geom,
   return true;
 }
 
-static void ExportCurveTrianglePlanes(Mesh *mesh,
-                                      ParticleCurveData *CData,
-                                      float3 RotCam,
-                                      bool is_ortho)
-{
-  int vertexno = mesh->verts.size();
-  int vertexindex = vertexno;
-  int numverts = 0, numtris = 0;
-
-  /* compute and reserve size of arrays */
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      numverts += 2 + (CData->curve_keynum[curve] - 1) * 2;
-      numtris += (CData->curve_keynum[curve] - 1) * 2;
-    }
-  }
-
-  mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
-
-  /* actually export */
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      float3 xbasis;
-      float3 v1;
-      float time = 0.0f;
-      float3 ickey_loc = CData->curvekey_co[CData->curve_firstkey[curve]];
-      float radius = shaperadius(
-          CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], 0.0f);
-      v1 = CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
-           CData->curvekey_co[CData->curve_firstkey[curve]];
-      if (is_ortho)
-        xbasis = normalize(cross(RotCam, v1));
-      else
-        xbasis = normalize(cross(RotCam - ickey_loc, v1));
-      float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
-      float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
-      mesh->add_vertex(ickey_loc_shfl);
-      mesh->add_vertex(ickey_loc_shfr);
-      vertexindex += 2;
-
-      for (int curvekey = CData->curve_firstkey[curve] + 1;
-           curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve];
-           curvekey++) {
-        ickey_loc = CData->curvekey_co[curvekey];
-
-        if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
-          v1 = CData->curvekey_co[curvekey] -
-               CData->curvekey_co[max(curvekey - 1, CData->curve_firstkey[curve])];
-        else
-          v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey - 1];
-
-        time = CData->curvekey_time[curvekey] / CData->curve_length[curve];
-        radius = shaperadius(
-            CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], time);
-
-        if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
-          radius = shaperadius(CData->psys_shape[sys],
-                               CData->psys_rootradius[sys],
-                               CData->psys_tipradius[sys],
-                               0.95f);
-
-        if (CData->psys_closetip[sys] &&
-            (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
-          radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f);
-
-        if (is_ortho)
-          xbasis = normalize(cross(RotCam, v1));
-        else
-          xbasis = normalize(cross(RotCam - ickey_loc, v1));
-        float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
-        float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
-        mesh->add_vertex(ickey_loc_shfl);
-        mesh->add_vertex(ickey_loc_shfr);
-        mesh->add_triangle(
-            vertexindex - 2, vertexindex, vertexindex - 1, CData->psys_shader[sys], true);
-        mesh->add_triangle(
-            vertexindex + 1, vertexindex - 1, vertexindex, CData->psys_shader[sys], true);
-        vertexindex += 2;
-      }
-    }
-  }
-
-  mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
-  mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
-  mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-  mesh->add_face_normals();
-  mesh->add_vertex_normals();
-  mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-
-  /* texture coords still needed */
-}
-
-static void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution)
-{
-  int vertexno = mesh->verts.size();
-  int vertexindex = vertexno;
-  int numverts = 0, numtris = 0;
-
-  /* compute and reserve size of arrays */
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      numverts += (CData->curve_keynum[curve] - 1) * resolution + resolution;
-      numtris += (CData->curve_keynum[curve] - 1) * 2 * resolution;
-    }
-  }
-
-  mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
-
-  /* actually export */
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      float3 firstxbasis = cross(make_float3(1.0f, 0.0f, 0.0f),
-                                 CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
-                                     CData->curvekey_co[CData->curve_firstkey[curve]]);
-      if (!is_zero(firstxbasis))
-        firstxbasis = normalize(firstxbasis);
-      else
-        firstxbasis = normalize(cross(make_float3(0.0f, 1.0f, 0.0f),
-                                      CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
-                                          CData->curvekey_co[CData->curve_firstkey[curve]]));
-
-      for (int curvekey = CData->curve_firstkey[curve];
-           curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
-           curvekey++) {
-        float3 xbasis = firstxbasis;
-        float3 v1;
-        float3 v2;
-
-        if (curvekey == CData->curve_firstkey[curve]) {
-          v1 = CData->curvekey_co[min(
-                   curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] -
-               CData->curvekey_co[curvekey + 1];
-          v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
-        }
-        else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) {
-          v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
-          v2 = CData->curvekey_co[curvekey - 1] -
-               CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])];
-        }
-        else {
-          v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
-          v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
-        }
-
-        xbasis = cross(v1, v2);
-
-        if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) {
-          firstxbasis = normalize(xbasis);
-          break;
-        }
-      }
-
-      for (int curvekey = CData->curve_firstkey[curve];
-           curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
-           curvekey++) {
-        int subv = 1;
-        float3 xbasis;
-        float3 ybasis;
-        float3 v1;
-        float3 v2;
-
-        if (curvekey == CData->curve_firstkey[curve]) {
-          subv = 0;
-          v1 = CData->curvekey_co[min(
-                   curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] -
-               CData->curvekey_co[curvekey + 1];
-          v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
-        }
-        else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) {
-          v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
-          v2 = CData->curvekey_co[curvekey - 1] -
-               CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])];
-        }
-        else {
-          v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
-          v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
-        }
-
-        xbasis = cross(v1, v2);
-
-        if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) {
-          xbasis = normalize(xbasis);
-          firstxbasis = xbasis;
-        }
-        else
-          xbasis = firstxbasis;
-
-        ybasis = normalize(cross(xbasis, v2));
-
-        for (; subv <= 1; subv++) {
-          float3 ickey_loc = make_float3(0.0f, 0.0f, 0.0f);
-          float time = 0.0f;
-
-          InterpolateKeySegments(subv, 1, curvekey, curve, &ickey_loc, &time, CData);
-
-          float radius = shaperadius(CData->psys_shape[sys],
-                                     CData->psys_rootradius[sys],
-                                     CData->psys_tipradius[sys],
-                                     time);
-
-          if ((curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2) &&
-              (subv == 1))
-            radius = shaperadius(CData->psys_shape[sys],
-                                 CData->psys_rootradius[sys],
-                                 CData->psys_tipradius[sys],
-                                 0.95f);
-
-          if (CData->psys_closetip[sys] && (subv == 1) &&
-              (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2))
-            radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f);
-
-          float angle = M_2PI_F / (float)resolution;
-          for (int section = 0; section < resolution; section++) {
-            float3 ickey_loc_shf = ickey_loc + radius * (cosf(angle * section) * xbasis +
-                                                         sinf(angle * section) * ybasis);
-            mesh->add_vertex(ickey_loc_shf);
-          }
-
-          if (subv != 0) {
-            for (int section = 0; section < resolution - 1; section++) {
-              mesh->add_triangle(vertexindex - resolution + section,
-                                 vertexindex + section,
-                                 vertexindex - resolution + section + 1,
-                                 CData->psys_shader[sys],
-                                 true);
-              mesh->add_triangle(vertexindex + section + 1,
-                                 vertexindex - resolution + section + 1,
-                                 vertexindex + section,
-                                 CData->psys_shader[sys],
-                                 true);
-            }
-            mesh->add_triangle(vertexindex - 1,
-                               vertexindex + resolution - 1,
-                               vertexindex - resolution,
-                               CData->psys_shader[sys],
-                               true);
-            mesh->add_triangle(vertexindex,
-                               vertexindex - resolution,
-                               vertexindex + resolution - 1,
-                               CData->psys_shader[sys],
-                               true);
-          }
-          vertexindex += resolution;
-        }
-      }
-    }
-  }
-
-  mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
-  mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
-  mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-  mesh->add_face_normals();
-  mesh->add_vertex_normals();
-  mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-
-  /* texture coords still needed */
-}
-
 static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CData)
 {
   int num_keys = 0;
@@ -823,154 +505,8 @@ static void ExportCurveSegmentsMotion(Hair *hair, ParticleCurveData *CData, int
   }
 }
 
-static void ExportCurveTriangleUV(ParticleCurveData *CData, int resol, float2 *uvdata)
-{
-  if (uvdata == NULL)
-    return;
-  int vertexindex = 0;
-
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      for (int curvekey = CData->curve_firstkey[curve];
-           curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
-           curvekey++) {
-        for (int section = 0; section < resol; section++) {
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-          uvdata[vertexindex] = CData->curve_uv[curve];
-          vertexindex++;
-        }
-      }
-    }
-  }
-}
-
-static void ExportCurveTriangleVcol(ParticleCurveData *CData, int resol, uchar4 *cdata)
-{
-  if (cdata == NULL)
-    return;
-
-  int vertexindex = 0;
-
-  for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-    for (int curve = CData->psys_firstcurve[sys];
-         curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
-         curve++) {
-      for (int curvekey = CData->curve_firstkey[curve];
-           curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
-           curvekey++) {
-        for (int section = 0; section < resol; section++) {
-          /* Encode vertex color using the sRGB curve. */
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-          cdata[vertexindex] = color_float_to_byte(
-              color_srgb_to_linear_v3(CData->curve_vcol[curve]));
-          vertexindex++;
-        }
-      }
-    }
-  }
-}
-
 /* Hair Curve Sync */
 
-void BlenderSync::sync_curve_settings(BL::Depsgraph &b_depsgraph)
-{
-  PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves");
-
-  CurveSystemManager *curve_system_manager = scene->curve_system_manager;
-  CurveSystemManager prev_curve_system_manager = *curve_system_manager;
-
-  curve_system_manager->use_curves = get_boolean(csscene, "use_curves");
-
-  curve_system_manager->primitive = (CurvePrimitiveType)get_enum(
-      csscene, "primitive", CURVE_NUM_PRIMITIVE_TYPES, CURVE_LINE_SEGMENTS);
-  curve_system_manager->curve_shape = (CurveShapeType)get_enum(
-      csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK);
-  curve_system_manager->resolution = get_int(csscene, "resolution");
-  curve_system_manager->subdivisions = get_int(csscene, "subdivisions");
-  curve_system_manager->use_backfacing = !get_boolean(csscene, "cull_backfacing");
-
-  /* Triangles */
-  if (curve_system_manager->primitive == CURVE_TRIANGLES) {
-    /* camera facing planes */
-    if (curve_system_manager->curve_shape == CURVE_RIBBON) {
-      curve_system_manager->triangle_method = CURVE_CAMERA_TRIANGLES;
-      curve_system_manager->resolution = 1;
-    }
-    else if (curve_system_manager->curve_shape == CURVE_THICK) {
-      curve_system_manager->triangle_method = CURVE_TESSELATED_TRIANGLES;
-    }
-  }
-  /* Line Segments */
-  else if (curve_system_manager->primitive == CURVE_LINE_SEGMENTS) {
-    if (curve_system_manager->curve_shape == CURVE_RIBBON) {
-      /* tangent shading */
-      curve_system_manager->line_method = CURVE_UNCORRECTED;
-      curve_system_manager->use_encasing = true;
-      curve_system_manager->use_backfacing = false;
-      curve_system_manager->use_tangent_normal_geometry = true;
-    }
-    else if (curve_system_manager->curve_shape == CURVE_THICK) {
-      curve_system_manager->line_method = CURVE_ACCURATE;
-      curve_system_manager->use_encasing = false;
-      curve_system_manager->use_tangent_normal_geometry = false;
-    }
-  }
-  /* Curve Segments */
-  else if (curve_system_manager->primitive == CURVE_SEGMENTS) {
-    if (curve_system_manager->curve_shape == CURVE_RIBBON) {
-      curve_system_manager->primitive = CURVE_RIBBONS;
-      curve_system_manager->use_backfacing = false;
-    }
-  }
-
-  if (curve_system_manager->modified_mesh(prev_curve_system_manager)) {
-    BL::Depsgraph::objects_iterator b_ob;
-
-    for (b_depsgraph.objects.begin(b_ob); b_ob != b_data.objects.end(); ++b_ob) {
-      if (object_is_mesh(*b_ob)) {
-        BL::Object::particle_systems_iterator b_psys;
-        for (b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end();
-             ++b_psys) {
-          if ((b_psys->settings().render_type() == BL::ParticleSettings::render_type_PATH) &&
-              (b_psys->settings().type() == BL::ParticleSettings::type_HAIR)) {
-            BL::ID key = BKE_object_is_modified(*b_ob) ? *b_ob : b_ob->data();
-            geometry_map.set_recalc(key);
-            object_map.set_recalc(*b_ob);
-          }
-        }
-      }
-    }
-  }
-
-  if (curve_system_manager->modified(prev_curve_system_manager))
-    curve_system_manager->tag_update(scene);
-}
-
 bool BlenderSync::object_has_particle_hair(BL::Object b_ob)
 {
   /* Test if the object has a particle modifier with hair. */
@@ -994,78 +530,38 @@ bool BlenderSync::object_has_particle_hair(BL::Object b_ob)
 
 /* Old particle hair. */
 void BlenderSync::sync_particle_hair(
-    Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
+    Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
 {
-  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
-  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
-
   /* obtain general settings */
   if (b_ob.mode() == b_ob.mode_PARTICLE_EDIT || b_ob.mode() == b_ob.mode_EDIT) {
     return;
   }
 
-  const int triangle_method = scene->curve_system_manager->triangle_method;
-  const int resolution = scene->curve_system_manager->resolution;
-  int used_res = 1;
-
   /* extract particle hair data - should be combined with connecting to mesh later*/
 
   ParticleCurveData CData;
 
-  ObtainCacheParticleData(geom, &b_mesh, &b_ob, &CData, !preview);
-
-  /* add hair geometry to mesh */
-  if (mesh) {
-    if (triangle_method == CURVE_CAMERA_TRIANGLES) {
-      /* obtain camera parameters */
-      float3 RotCam;
-      Camera *camera = scene->camera;
-      Transform &ctfm = camera->matrix;
-      if (camera->type == CAMERA_ORTHOGRAPHIC) {
-        RotCam = -make_float3(ctfm.x.z, ctfm.y.z, ctfm.z.z);
-      }
-      else {
-        Transform tfm = get_transform(b_ob.matrix_world());
-        Transform itfm = transform_quick_inverse(tfm);
-        RotCam = transform_point(&itfm, make_float3(ctfm.x.w, ctfm.y.w, ctfm.z.w));
-      }
-      bool is_ortho = camera->type == CAMERA_ORTHOGRAPHIC;
-      ExportCurveTrianglePlanes(mesh, &CData, RotCam, is_ortho);
-    }
-    else {
-      ExportCurveTriangleGeometry(mesh, &CData, resolution);
-      used_res = resolution;
-    }
-  }
-  else {
-    if (motion)
-      ExportCurveSegmentsMotion(hair, &CData, motion_step);
-    else
-      ExportCurveSegments(scene, hair, &CData);
-  }
+  ObtainCacheParticleData(hair, &b_mesh, &b_ob, &CData, !preview);
+
+  /* add hair geometry */
+  if (motion)
+    ExportCurveSegmentsMotion(hair, &CData, motion_step);
+  else
+    ExportCurveSegments(scene, hair, &CData);
 
   /* generated coordinates from first key. we should ideally get this from
    * blender to handle deforming objects */
   if (!motion) {
-    if (geom->need_attribute(scene, ATTR_STD_GENERATED)) {
+    if (hair->need_attribute(scene, ATTR_STD_GENERATED)) {
       float3 loc, size;
       mesh_texture_space(b_mesh, loc, size);
 
-      if (mesh) {
-        Attribute *attr_generated = mesh->attributes.add(ATTR_STD_GENERATED);
-        float3 *generated = attr_generated->data_float3();
-
-        for (size_t i = 0; i < mesh->verts.size(); i++)
-          generated[i] = mesh->verts[i] * size - loc;
-      }
-      else {
-        Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED);
-        float3 *generated = attr_generated->data_float3();
+      Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED);
+      float3 *generated = attr_generated->data_float3();
 
-        for (size_t i = 0; i < hair->num_curves(); i++) {
-          float3 co = hair->curve_keys[hair->get_curve(i).first_key];
-          generated[i] = co * size - loc;
-        }
+      for (size_t i = 0; i < hair->num_curves(); i++) {
+        float3 co = hair->curve_keys[hair->get_curve(i).first_key];
+        generated[i] = co * size - loc;
       }
     }
   }
@@ -1076,32 +572,22 @@ void BlenderSync::sync_particle_hair(
     int vcol_num = 0;
 
     for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l, vcol_num++) {
-      if (!geom->need_attribute(scene, ustring(l->name().c_str())))
+      if (!hair->need_attribute(scene, ustring(l->name().c_str())))
         continue;
 
-      ObtainCacheParticleVcol(geom, &b_mesh, &b_ob, &CData, !preview, vcol_num);
+      ObtainCacheParticleVcol(hair, &b_mesh, &b_ob, &CData, !preview, vcol_num);
 
-      if (mesh) {
-        Attribute *attr_vcol = mesh->attributes.add(
-            ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+      Attribute *attr_vcol = hair->attributes.add(
+          ustring(l->name().c_str()), TypeRGBA, ATTR_ELEMENT_CURVE);
 
-        uchar4 *cdata = attr_vcol->data_uchar4();
+      float4 *fdata = attr_vcol->data_float4();
 
-        ExportCurveTriangleVcol(&CData, used_res, cdata);
-      }
-      else {
-        Attribute *attr_vcol = hair->attributes.add(
-            ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CURVE);
-
-        float3 *fdata = attr_vcol->data_float3();
+      if (fdata) {
+        size_t i = 0;
 
-        if (fdata) {
-          size_t i = 0;
-
-          /* Encode vertex color using the sRGB curve. */
-          for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) {
-            fdata[i++] = color_srgb_to_linear_v3(CData.curve_vcol[curve]);
-          }
+        /* Convert vertex color from sRGB to linear. */
+        for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) {
+          fdata[i++] = color_srgb_to_linear_v4(CData.curve_vcol[curve]);
         }
       }
     }
@@ -1118,35 +604,23 @@ void BlenderSync::sync_particle_hair(
       ustring name = ustring(l->name().c_str());
 
       /* UV map */
-      if (geom->need_attribute(scene, name) || geom->need_attribute(scene, std)) {
+      if (hair->need_attribute(scene, name) || hair->need_attribute(scene, std)) {
         Attribute *attr_uv;
 
-        ObtainCacheParticleUV(geom, &b_mesh, &b_ob, &CData, !preview, uv_num);
+        ObtainCacheParticleUV(hair, &b_mesh, &b_ob, &CData, !preview, uv_num);
 
-        if (mesh) {
-          if (active_render)
-            attr_uv = mesh->attributes.add(std, name);
-          else
-            attr_uv = mesh->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CORNER);
-
-          float2 *uv = attr_uv->data_float2();
-
-          ExportCurveTriangleUV(&CData, used_res, uv);
-        }
-        else {
-          if (active_render)
-            attr_uv = hair->attributes.add(std, name);
-          else
-            attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
+        if (active_render)
+          attr_uv = hair->attributes.add(std, name);
+        else
+          attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
 
-          float2 *uv = attr_uv->data_float2();
+        float2 *uv = attr_uv->data_float2();
 
-          if (uv) {
-            size_t i = 0;
+        if (uv) {
+          size_t i = 0;
 
-            for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) {
-              uv[i++] = CData.curve_uv[curve];
-            }
+          for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) {
+            uv[i++] = CData.curve_uv[curve];
           }
         }
       }
@@ -1154,7 +628,6 @@ void BlenderSync::sync_particle_hair(
   }
 }
 
-#ifdef WITH_NEW_OBJECT_TYPES
 static float4 hair_point_as_float4(BL::HairPoint b_point)
 {
   float4 mP = float3_to_float4(get_float3(b_point.co()));
@@ -1320,12 +793,10 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st
     export_hair_motion_validate_attribute(hair, motion_step, num_motion_keys, have_motion);
   }
 }
-#endif /* WITH_NEW_OBJECT_TYPES */
 
 /* Hair object. */
 void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step)
 {
-#ifdef WITH_NEW_OBJECT_TYPES
   /* Convert Blender hair to Cycles curves. */
   BL::Hair b_hair(b_ob.data());
   if (motion) {
@@ -1334,97 +805,70 @@ void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motio
   else {
     export_hair_curves(scene, hair, b_hair);
   }
-#else
-  (void)hair;
-  (void)b_ob;
-  (void)motion;
-  (void)motion_step;
-#endif /* WITH_NEW_OBJECT_TYPES */
 }
 
 void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph,
                             BL::Object b_ob,
-                            Geometry *geom,
+                            Hair *hair,
                             const vector<Shader *> &used_shaders)
 {
-  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
-  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
-
   /* Compares curve_keys rather than strands in order to handle quick hair
    * adjustments in dynamic BVH - other methods could probably do this better. */
   array<float3> oldcurve_keys;
   array<float> oldcurve_radius;
-  array<int> oldtriangles;
-  if (hair) {
-    oldcurve_keys.steal_data(hair->curve_keys);
-    oldcurve_radius.steal_data(hair->curve_radius);
-  }
-  else {
-    oldtriangles.steal_data(mesh->triangles);
-  }
+  oldcurve_keys.steal_data(hair->curve_keys);
+  oldcurve_radius.steal_data(hair->curve_radius);
 
-  geom->clear();
-  geom->used_shaders = used_shaders;
+  hair->clear();
+  hair->used_shaders = used_shaders;
 
-  if (view_layer.use_hair && scene->curve_system_manager->use_curves) {
-#ifdef WITH_NEW_OBJECT_TYPES
+  if (view_layer.use_hair) {
     if (b_ob.type() == BL::Object::type_HAIR) {
       /* Hair object. */
       sync_hair(hair, b_ob, false);
-      assert(mesh == NULL);
     }
-    else
-#endif
-    {
+    else {
       /* Particle hair. */
-      bool need_undeformed = geom->need_attribute(scene, ATTR_STD_GENERATED);
+      bool need_undeformed = hair->need_attribute(scene, ATTR_STD_GENERATED);
       BL::Mesh b_mesh = object_to_mesh(
           b_data, b_ob, b_depsgraph, need_undeformed, Mesh::SUBDIVISION_NONE);
 
       if (b_mesh) {
-        sync_particle_hair(geom, b_mesh, b_ob, false);
+        sync_particle_hair(hair, b_mesh, b_ob, false);
         free_object_to_mesh(b_data, b_ob, b_mesh);
       }
     }
   }
 
   /* tag update */
-  const bool rebuild = (hair && ((oldcurve_keys != hair->curve_keys) ||
-                                 (oldcurve_radius != hair->curve_radius))) ||
-                       (mesh && (oldtriangles != mesh->triangles));
+  const bool rebuild = ((oldcurve_keys != hair->curve_keys) ||
+                        (oldcurve_radius != hair->curve_radius));
 
-  geom->tag_update(scene, rebuild);
+  hair->tag_update(scene, rebuild);
 }
 
 void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
                                    BL::Object b_ob,
-                                   Geometry *geom,
+                                   Hair *hair,
                                    int motion_step)
 {
-  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
-  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
-
   /* Skip if nothing exported. */
-  if ((hair && hair->num_keys() == 0) || (mesh && mesh->verts.size() == 0)) {
+  if (hair->num_keys() == 0) {
     return;
   }
 
   /* Export deformed coordinates. */
   if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
-#ifdef WITH_NEW_OBJECT_TYPES
     if (b_ob.type() == BL::Object::type_HAIR) {
       /* Hair object. */
       sync_hair(hair, b_ob, true, motion_step);
-      assert(mesh == NULL);
       return;
     }
-    else
-#endif
-    {
+    else {
       /* Particle hair. */
       BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
       if (b_mesh) {
-        sync_particle_hair(geom, b_mesh, b_ob, true, motion_step);
+        sync_particle_hair(hair, b_mesh, b_ob, true, motion_step);
         free_object_to_mesh(b_data, b_ob, b_mesh);
         return;
       }
@@ -1432,12 +876,7 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
   }
 
   /* No deformation on this frame, copy coordinates if other frames did have it. */
-  if (hair) {
-    hair->copy_center_to_motion_step(motion_step);
-  }
-  else {
-    mesh->copy_center_to_motion_step(motion_step);
-  }
+  hair->copy_center_to_motion_step(motion_step);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index ac52948806c..fb9ab9e8c97 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -21,13 +21,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-enum DenoiserType {
-  DENOISER_NONE = 0,
-  DENOISER_OPTIX = 1,
-
-  DENOISER_NUM
-};
-
 enum ComputeDevice {
   COMPUTE_DEVICE_CPU = 0,
   COMPUTE_DEVICE_CUDA = 1,
@@ -120,49 +113,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     }
   }
 
-  /* Ensure there is an OptiX device when using the OptiX denoiser. */
-  bool use_optix_denoising = get_enum(cscene, "preview_denoising", DENOISER_NUM, DENOISER_NONE) ==
-                                 DENOISER_OPTIX &&
-                             !background;
-  BL::Scene::view_layers_iterator b_view_layer;
-  for (b_scene.view_layers.begin(b_view_layer); b_view_layer != b_scene.view_layers.end();
-       ++b_view_layer) {
-    PointerRNA crl = RNA_pointer_get(&b_view_layer->ptr, "cycles");
-    if (get_boolean(crl, "use_optix_denoising")) {
-      use_optix_denoising = true;
-    }
-  }
-
-  if (use_optix_denoising && device.type != DEVICE_OPTIX) {
-    vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
-    if (!optix_devices.empty()) {
-      /* Convert to a special multi device with separate denoising devices. */
-      if (device.multi_devices.empty()) {
-        device.multi_devices.push_back(device);
-      }
-
-      /* Try to use the same physical devices for denoising. */
-      for (const DeviceInfo &cuda_device : device.multi_devices) {
-        if (cuda_device.type == DEVICE_CUDA) {
-          for (const DeviceInfo &optix_device : optix_devices) {
-            if (cuda_device.num == optix_device.num) {
-              device.id += optix_device.id;
-              device.denoising_devices.push_back(optix_device);
-              break;
-            }
-          }
-        }
-      }
-
-      if (device.denoising_devices.empty()) {
-        /* Simply use the first available OptiX device. */
-        const DeviceInfo optix_device = optix_devices.front();
-        device.id += optix_device.id; /* Uniquely identify this special multi device. */
-        device.denoising_devices.push_back(optix_device);
-      }
-    }
-  }
-
   return device;
 }
 
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
index 7ca35cff961..f7e4623024d 100644
--- a/intern/cycles/blender/blender_geometry.cpp
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -40,17 +40,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
   BL::Material material_override = view_layer.material_override;
   Shader *default_shader = (b_ob.type() == BL::Object::type_VOLUME) ? scene->default_volume :
                                                                       scene->default_surface;
-#ifdef WITH_NEW_OBJECT_TYPES
-  Geometry::Type geom_type = ((b_ob.type() == BL::Object::type_HAIR || use_particle_hair) &&
-                              (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ?
+  Geometry::Type geom_type = (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) ?
                                  Geometry::HAIR :
                                  Geometry::MESH;
-#else
-  Geometry::Type geom_type = ((use_particle_hair) &&
-                              (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ?
-                                 Geometry::HAIR :
-                                 Geometry::MESH;
-#endif
 
   /* Find shader indices. */
   vector<Shader *> used_shaders;
@@ -129,12 +121,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
 
   geom->name = ustring(b_ob_data.name().c_str());
 
-#ifdef WITH_NEW_OBJECT_TYPES
   if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
-#else
-  if (use_particle_hair) {
-#endif
-    sync_hair(b_depsgraph, b_ob, geom, used_shaders);
+    Hair *hair = static_cast<Hair *>(geom);
+    sync_hair(b_depsgraph, b_ob, hair, used_shaders);
   }
   else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
     Mesh *mesh = static_cast<Mesh *>(geom);
@@ -173,12 +162,9 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
     return;
   }
 
-#ifdef WITH_NEW_OBJECT_TYPES
   if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
-#else
-  if (use_particle_hair) {
-#endif
-    sync_hair_motion(b_depsgraph, b_ob, geom, motion_step);
+    Hair *hair = static_cast<Hair *>(geom);
+    sync_hair_motion(b_depsgraph, b_ob, hair, motion_step);
   }
   else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
     /* No volume motion blur support yet. */
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index a6f380a9ae7..49407799fcd 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -278,25 +278,59 @@ static void mikk_compute_tangents(
   genTangSpaceDefault(&context);
 }
 
+/* Create one per-vertex RGBA attribute for each needed sculpt vertex color layer. */
+static void attr_create_sculpt_vertex_color(Scene *scene,
+                                            Mesh *mesh,
+                                            BL::Mesh &b_mesh,
+                                            bool subdivision)
+{
+  BL::Mesh::sculpt_vertex_colors_iterator l;
+
+  for (b_mesh.sculpt_vertex_colors.begin(l); l != b_mesh.sculpt_vertex_colors.end(); ++l) {
+    const bool active_render = l->active_render();
+    AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+    ustring vcol_name = ustring(l->name().c_str());
+
+    const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+                           mesh->need_attribute(scene, vcol_std);
+
+    if (!need_vcol) {
+      continue; /* Neither the named nor the standard attribute is needed. */
+    }
+
+    AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
+    Attribute *vcol_attr = attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_VERTEX);
+    vcol_attr->std = vcol_std; /* ATTR_STD_NONE unless this is the active render layer. */
+
+    float4 *cdata = vcol_attr->data_float4();
+    int numverts = b_mesh.vertices.length(); /* One color is copied per mesh vertex. */
+
+    for (int i = 0; i < numverts; i++) {
+      *(cdata++) = get_float4(l->data[i].color());
+    }
+  }
+}
+
 /* Create vertex color attributes. */
 static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision)
 {
-  if (subdivision) {
-    BL::Mesh::vertex_colors_iterator l;
+  BL::Mesh::vertex_colors_iterator l;
 
-    for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
-      const bool active_render = l->active_render();
-      AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
-      ustring vcol_name = ustring(l->name().c_str());
+  for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
+    const bool active_render = l->active_render();
+    AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+    ustring vcol_name = ustring(l->name().c_str());
 
-      const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
-                             mesh->need_attribute(scene, vcol_std);
+    const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+                           mesh->need_attribute(scene, vcol_std);
 
-      if (!need_vcol) {
-        continue;
-      }
+    if (!need_vcol) {
+      continue;
+    }
 
-      Attribute *vcol_attr = NULL;
+    Attribute *vcol_attr = NULL;
+
+    if (subdivision) {
       if (active_render) {
         vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name);
       }
@@ -316,22 +350,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
         }
       }
     }
-  }
-  else {
-    BL::Mesh::vertex_colors_iterator l;
-    for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
-      const bool active_render = l->active_render();
-      AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
-      ustring vcol_name = ustring(l->name().c_str());
-
-      const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
-                             mesh->need_attribute(scene, vcol_std);
-
-      if (!need_vcol) {
-        continue;
-      }
-
-      Attribute *vcol_attr = NULL;
+    else {
       if (active_render) {
         vcol_attr = mesh->attributes.add(vcol_std, vcol_name);
       }
@@ -828,6 +847,7 @@ static void create_mesh(Scene *scene,
    */
   attr_create_pointiness(scene, mesh, b_mesh, subdivision);
   attr_create_vertex_color(scene, mesh, b_mesh, subdivision);
+  attr_create_sculpt_vertex_color(scene, mesh, b_mesh, subdivision);
   attr_create_random_per_island(scene, mesh, b_mesh, subdivision);
 
   if (subdivision) {
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index c28586d0f63..d3a37563ef4 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -69,11 +69,7 @@ bool BlenderSync::object_is_mesh(BL::Object &b_ob)
 
   BL::Object::type_enum type = b_ob.type();
 
-#ifdef WITH_NEW_OBJECT_TYPES
   if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
-#else
-  if (type == BL::Object::type_VOLUME) {
-#endif
     /* Will be exported attached to mesh. */
     return true;
   }
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 0be19dbffd1..3e595c3ee52 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -31,8 +31,10 @@
 #include "util/util_logging.h"
 #include "util/util_md5.h"
 #include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
 #include "util/util_path.h"
 #include "util/util_string.h"
+#include "util/util_task.h"
 #include "util/util_types.h"
 
 #ifdef WITH_OSL
@@ -1075,5 +1077,14 @@ void *CCL_python_module_init()
   Py_INCREF(Py_False);
 #endif /* WITH_EMBREE */
 
+  if (ccl::openimagedenoise_supported()) {
+    PyModule_AddObject(mod, "with_openimagedenoise", Py_True);
+    Py_INCREF(Py_True);
+  }
+  else {
+    PyModule_AddObject(mod, "with_openimagedenoise", Py_False);
+    Py_INCREF(Py_False);
+  }
+
   return (void *)mod;
 }
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index dbe87ce2b13..391a1b8f473 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -158,7 +158,7 @@ void BlenderSession::create_session()
 
   /* set buffer parameters */
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
   session->reset(buffer_params, session_params.samples);
 
   b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -239,8 +239,13 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
 
   BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
   BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
-  BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_scene, b_render, b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+  BufferParams buffer_params = BlenderSync::get_buffer_params(b_render,
+                                                              b_null_space_view3d,
+                                                              b_null_region_view3d,
+                                                              scene->camera,
+                                                              width,
+                                                              height,
+                                                              session_params.denoising.use);
   session->reset(buffer_params, session_params.samples);
 
   b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -468,14 +473,13 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   session->update_render_tile_cb = function_bind(
       &BlenderSession::update_render_tile, this, _1, _2);
 
+  BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+
   /* get buffer parameters */
   SessionParams session_params = BlenderSync::get_session_params(
-      b_engine, b_userpref, b_scene, background);
+      b_engine, b_userpref, b_scene, background, b_view_layer);
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
-
-  /* render each layer */
-  BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+      b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
 
   /* temporary render result to find needed passes and views */
   BL::RenderResult b_rr = begin_render_result(
@@ -485,35 +489,26 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   BL::RenderLayer b_rlay = *b_single_rlay;
   b_rlay_name = b_view_layer.name();
 
-  /* add passes */
-  vector<Pass> passes = sync->sync_render_passes(
-      b_rlay, b_view_layer, session_params.adaptive_sampling);
-  buffer_params.passes = passes;
+  /* Update denoising parameters. */
+  session->set_denoising(session_params.denoising);
 
-  PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
-  bool use_denoising = get_boolean(crl, "use_denoising");
-  bool use_optix_denoising = get_boolean(crl, "use_optix_denoising");
-  bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
+  bool use_denoising = session_params.denoising.use;
+  bool store_denoising_passes = session_params.denoising.store_passes;
 
-  buffer_params.denoising_data_pass = use_denoising || write_denoising_passes;
+  buffer_params.denoising_data_pass = use_denoising || store_denoising_passes;
   buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
-  buffer_params.denoising_prefiltered_pass = write_denoising_passes && !use_optix_denoising;
-
-  session->params.run_denoising = use_denoising || write_denoising_passes;
-  session->params.full_denoising = use_denoising && !use_optix_denoising;
-  session->params.optix_denoising = use_denoising && use_optix_denoising;
-  session->params.write_denoising_passes = write_denoising_passes && !use_optix_denoising;
-  session->params.denoising.radius = get_int(crl, "denoising_radius");
-  session->params.denoising.strength = get_float(crl, "denoising_strength");
-  session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength");
-  session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca");
-  session->params.denoising.optix_input_passes = get_enum(crl, "denoising_optix_input_passes");
-  session->tile_manager.schedule_denoising = session->params.run_denoising;
+  buffer_params.denoising_prefiltered_pass = store_denoising_passes &&
+                                             session_params.denoising.type == DENOISER_NLM;
 
   scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
   scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
   scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass;
 
+  /* Add passes */
+  vector<Pass> passes = sync->sync_render_passes(
+      b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising);
+  buffer_params.passes = passes;
+
   scene->film->pass_alpha_threshold = b_view_layer.pass_alpha_threshold();
   scene->film->tag_passes_update(scene, passes);
   scene->film->tag_update(scene);
@@ -798,7 +793,7 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
 
   /* increase samples, but never decrease */
   session->set_samples(session_params.samples);
-  session->set_denoising_start_sample(session_params.denoising_start_sample);
+  session->set_denoising_start_sample(session_params.denoising.start_sample);
   session->set_pause(session_pause);
 
   /* copy recalc flags, outside of mutex so we can decide to do the real
@@ -831,21 +826,17 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
 
   /* get buffer parameters */
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
 
-  if (session_params.device.type != DEVICE_OPTIX &&
-      session_params.device.denoising_devices.empty()) {
-    /* cannot use OptiX denoising when it is not supported by the device. */
-    buffer_params.denoising_data_pass = false;
-  }
-  else {
-    session->set_denoising(buffer_params.denoising_data_pass, true);
+  if (!buffer_params.denoising_data_pass) {
+    session_params.denoising.use = false;
   }
 
+  session->set_denoising(session_params.denoising);
+
+  /* Update film if denoising data was enabled or disabled. */
   if (scene->film->denoising_data_pass != buffer_params.denoising_data_pass) {
     scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
-
-    /* Force a scene and session reset below. */
     scene->film->tag_update(scene);
   }
 
@@ -917,7 +908,7 @@ bool BlenderSession::draw(int w, int h)
       SessionParams session_params = BlenderSync::get_session_params(
           b_engine, b_userpref, b_scene, background);
       BufferParams buffer_params = BlenderSync::get_buffer_params(
-          b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
+          b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
       bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
       if (session_pause == false) {
@@ -935,7 +926,7 @@ bool BlenderSession::draw(int w, int h)
 
   /* draw */
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_render, b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use);
   DeviceDrawParams draw_params;
 
   if (session->params.display_buffer_linear) {
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index f207d8ae07f..19d2730dc93 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -813,6 +813,14 @@ static ShaderNode *add_node(Scene *scene,
     sky->sun_direction = normalize(get_float3(b_sky_node.sun_direction()));
     sky->turbidity = b_sky_node.turbidity();
     sky->ground_albedo = b_sky_node.ground_albedo();
+    sky->sun_disc = b_sky_node.sun_disc();
+    sky->sun_size = b_sky_node.sun_size();
+    sky->sun_elevation = b_sky_node.sun_elevation();
+    sky->sun_rotation = b_sky_node.sun_rotation();
+    sky->altitude = b_sky_node.altitude();
+    sky->air_density = b_sky_node.air_density();
+    sky->dust_density = b_sky_node.dust_density();
+    sky->ozone_density = b_sky_node.ozone_density();
     BL::TexMapping b_texture_mapping(b_sky_node.texture_mapping());
     get_tex_mapping(&sky->tex_mapping, b_texture_mapping);
     node = sky;
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 09813dc8c05..bf065cc5492 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -38,6 +38,7 @@
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
 #include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -212,7 +213,6 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
   sync_film(b_v3d);
   sync_shaders(b_depsgraph, b_v3d);
   sync_images();
-  sync_curve_settings(b_depsgraph);
 
   geometry_synced.clear(); /* use for objects and motion sync */
 
@@ -538,7 +538,8 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
 
 vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
                                              BL::ViewLayer &b_view_layer,
-                                             bool adaptive_sampling)
+                                             bool adaptive_sampling,
+                                             const DenoiseParams &denoising)
 {
   vector<Pass> passes;
 
@@ -555,16 +556,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
       Pass::add(pass_type, passes, b_pass.name().c_str());
   }
 
-  PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles");
-  bool use_denoising = get_boolean(crp, "use_denoising");
-  bool use_optix_denoising = get_boolean(crp, "use_optix_denoising");
-  bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
+  PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
 
   scene->film->denoising_flags = 0;
-  if (use_denoising || write_denoising_passes) {
-    if (!use_optix_denoising) {
+  if (denoising.use || denoising.store_passes) {
+    if (denoising.type == DENOISER_NLM) {
 #define MAP_OPTION(name, flag) \
-  if (!get_boolean(crp, name)) \
+  if (!get_boolean(crl, name)) \
     scene->film->denoising_flags |= flag;
       MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
       MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
@@ -577,11 +575,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
     b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
   }
 
-  if (write_denoising_passes) {
+  if (denoising.store_passes) {
     b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
     b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
     b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
-    if (!use_optix_denoising) {
+    if (denoising.type == DENOISER_NLM) {
       b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
       b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
       b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
@@ -593,46 +591,46 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
   }
 
 #ifdef __KERNEL_DEBUG__
-  if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
+  if (get_boolean(crl, "pass_debug_bvh_traversed_nodes")) {
     b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_BVH_TRAVERSED_NODES, passes, "Debug BVH Traversed Nodes");
   }
-  if (get_boolean(crp, "pass_debug_bvh_traversed_instances")) {
+  if (get_boolean(crl, "pass_debug_bvh_traversed_instances")) {
     b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes, "Debug BVH Traversed Instances");
   }
-  if (get_boolean(crp, "pass_debug_bvh_intersections")) {
+  if (get_boolean(crl, "pass_debug_bvh_intersections")) {
     b_engine.add_pass("Debug BVH Intersections", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_BVH_INTERSECTIONS, passes, "Debug BVH Intersections");
   }
-  if (get_boolean(crp, "pass_debug_ray_bounces")) {
+  if (get_boolean(crl, "pass_debug_ray_bounces")) {
     b_engine.add_pass("Debug Ray Bounces", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_RAY_BOUNCES, passes, "Debug Ray Bounces");
   }
 #endif
-  if (get_boolean(crp, "pass_debug_render_time")) {
+  if (get_boolean(crl, "pass_debug_render_time")) {
     b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
   }
-  if (get_boolean(crp, "pass_debug_sample_count")) {
+  if (get_boolean(crl, "pass_debug_sample_count")) {
     b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
   }
-  if (get_boolean(crp, "use_pass_volume_direct")) {
+  if (get_boolean(crl, "use_pass_volume_direct")) {
     b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
     Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
   }
-  if (get_boolean(crp, "use_pass_volume_indirect")) {
+  if (get_boolean(crl, "use_pass_volume_indirect")) {
     b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
     Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd");
   }
 
   /* Cryptomatte stores two ID/weight pairs per RGBA layer.
    * User facing parameter is the number of pairs. */
-  int crypto_depth = divide_up(min(16, get_int(crp, "pass_crypto_depth")), 2);
+  int crypto_depth = divide_up(min(16, get_int(crl, "pass_crypto_depth")), 2);
   scene->film->cryptomatte_depth = crypto_depth;
   scene->film->cryptomatte_passes = CRYPT_NONE;
-  if (get_boolean(crp, "use_pass_crypto_object")) {
+  if (get_boolean(crl, "use_pass_crypto_object")) {
     for (int i = 0; i < crypto_depth; i++) {
       string passname = cryptomatte_prefix + string_printf("Object%02d", i);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
@@ -641,7 +639,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
     scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
                                                         CRYPT_OBJECT);
   }
-  if (get_boolean(crp, "use_pass_crypto_material")) {
+  if (get_boolean(crl, "use_pass_crypto_material")) {
     for (int i = 0; i < crypto_depth; i++) {
       string passname = cryptomatte_prefix + string_printf("Material%02d", i);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
@@ -650,7 +648,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
     scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
                                                         CRYPT_MATERIAL);
   }
-  if (get_boolean(crp, "use_pass_crypto_asset")) {
+  if (get_boolean(crl, "use_pass_crypto_asset")) {
     for (int i = 0; i < crypto_depth; i++) {
       string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
@@ -659,19 +657,19 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
     scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
                                                         CRYPT_ASSET);
   }
-  if (get_boolean(crp, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) {
+  if (get_boolean(crl, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) {
     scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
                                                         CRYPT_ACCURATE);
   }
 
   if (adaptive_sampling) {
     Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
-    if (!get_boolean(crp, "pass_debug_sample_count")) {
+    if (!get_boolean(crl, "pass_debug_sample_count")) {
       Pass::add(PASS_SAMPLE_COUNT, passes);
     }
   }
 
-  RNA_BEGIN (&crp, b_aov, "aovs") {
+  RNA_BEGIN (&crl, b_aov, "aovs") {
     bool is_color = (get_enum(b_aov, "type") == 1);
     string name = get_string(b_aov, "name");
 
@@ -732,6 +730,11 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
   params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
   params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps");
 
+  PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves");
+  params.hair_subdivisions = get_int(csscene, "subdivisions");
+  params.hair_shape = (CurveShapeType)get_enum(
+      csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK);
+
   if (background && params.shadingsystem != SHADINGSYSTEM_OSL)
     params.persistent_data = r.use_persistent_data();
   else
@@ -751,20 +754,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
     params.texture_limit = 0;
   }
 
-  /* TODO(sergey): Once OSL supports per-microarchitecture optimization get
-   * rid of this.
-   */
-  if (params.shadingsystem == SHADINGSYSTEM_OSL) {
-    params.bvh_layout = BVH_LAYOUT_BVH4;
-  }
-  else {
-    params.bvh_layout = DebugFlags().cpu.bvh_layout;
-  }
-
-#ifdef WITH_EMBREE
-  params.bvh_layout = RNA_boolean_get(&cscene, "use_bvh_embree") ? BVH_LAYOUT_EMBREE :
-                                                                   params.bvh_layout;
-#endif
+  params.bvh_layout = DebugFlags().cpu.bvh_layout;
 
   params.background = background;
 
@@ -782,7 +772,8 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background)
 SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
                                               BL::Preferences &b_preferences,
                                               BL::Scene &b_scene,
-                                              bool background)
+                                              bool background,
+                                              BL::ViewLayer b_view_layer)
 {
   SessionParams params;
   PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -860,9 +851,22 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
     params.tile_order = TILE_BOTTOM_TO_TOP;
   }
 
-  /* other parameters */
+  /* Denoising */
+  params.denoising = get_denoise_params(b_scene, b_view_layer, background);
+
+  if (params.denoising.use) {
+    /* Add additional denoising devices if we are rendering and denoising
+     * with different devices. */
+    params.device.add_denoising_devices(params.denoising.type);
+
+    /* Check if denoiser is supported by device. */
+    if (!(params.device.denoisers & params.denoising.type)) {
+      params.denoising.use = false;
+    }
+  }
+
+  /* Viewport Performance */
   params.start_resolution = get_int(cscene, "preview_start_resolution");
-  params.denoising_start_sample = get_int(cscene, "preview_denoising_start_sample");
   params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
 
   /* other parameters */
@@ -915,4 +919,55 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
   return params;
 }
 
+DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
+                                              BL::ViewLayer &b_view_layer,
+                                              bool background)
+{
+  DenoiseParams denoising;
+  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+  /* `background` selects the final render settings; otherwise viewport settings are used. */
+  if (background) {
+    /* Final render: the on/off switch and the denoiser type come from the scene settings. */
+    denoising.use = get_boolean(cscene, "use_denoising");
+    denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE);
+    /* A view layer can only switch denoising off; it also supplies the remaining parameters. */
+    if (b_view_layer) {
+      PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles");
+      if (!get_boolean(clayer, "use_denoising")) {
+        denoising.use = false;
+      }
+
+      denoising.radius = get_int(clayer, "denoising_radius");
+      denoising.strength = get_float(clayer, "denoising_strength");
+      denoising.feature_strength = get_float(clayer, "denoising_feature_strength");
+      denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca");
+      denoising.optix_input_passes = get_enum(clayer, "denoising_optix_input_passes");
+
+      denoising.store_passes = get_boolean(clayer, "denoising_store_passes");
+    }
+  }
+  else {
+    /* Viewport denoising: read from the scene's preview settings, view layer is not used. */
+    denoising.use = get_boolean(cscene, "use_preview_denoising");
+    denoising.type = (DenoiserType)get_enum(
+        cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE);
+    denoising.start_sample = get_int(cscene, "preview_denoising_start_sample");
+
+    /* Auto select fastest denoiser: OptiX, then OpenImageDenoise, otherwise disable. */
+    if (denoising.type == DENOISER_NONE) {
+      if (!Device::available_devices(DEVICE_MASK_OPTIX).empty()) {
+        denoising.type = DENOISER_OPTIX;
+      }
+      else if (openimagedenoise_supported()) {
+        denoising.type = DENOISER_OPENIMAGEDENOISE;
+      }
+      else {
+        denoising.use = false;
+      }
+    }
+  }
+
+  return denoising;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 341281b18ee..0214d9eb3b8 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -75,7 +75,8 @@ class BlenderSync {
   void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
   vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
                                   BL::ViewLayer &b_view_layer,
-                                  bool adaptive_sampling);
+                                  bool adaptive_sampling,
+                                  const DenoiseParams &denoising);
   void sync_integrator();
   void sync_camera(BL::RenderSettings &b_render,
                    BL::Object &b_override,
@@ -94,23 +95,29 @@ class BlenderSync {
 
   /* get parameters */
   static SceneParams get_scene_params(BL::Scene &b_scene, bool background);
-  static SessionParams get_session_params(BL::RenderEngine &b_engine,
-                                          BL::Preferences &b_userpref,
-                                          BL::Scene &b_scene,
-                                          bool background);
+  static SessionParams get_session_params(
+      BL::RenderEngine &b_engine,
+      BL::Preferences &b_userpref,
+      BL::Scene &b_scene,
+      bool background,
+      BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL));
   static bool get_session_pause(BL::Scene &b_scene, bool background);
-  static BufferParams get_buffer_params(BL::Scene &b_scene,
-                                        BL::RenderSettings &b_render,
+  static BufferParams get_buffer_params(BL::RenderSettings &b_render,
                                         BL::SpaceView3D &b_v3d,
                                         BL::RegionView3D &b_rv3d,
                                         Camera *cam,
                                         int width,
-                                        int height);
+                                        int height,
+                                        const bool use_denoiser);
 
   static PassType get_pass_type(BL::RenderPass &b_pass);
   static int get_denoising_pass(BL::RenderPass &b_pass);
 
  private:
+  static DenoiseParams get_denoise_params(BL::Scene &b_scene,
+                                          BL::ViewLayer &b_view_layer,
+                                          bool background);
+
   /* sync */
   void sync_lights(BL::Depsgraph &b_depsgraph, bool update_all);
   void sync_materials(BL::Depsgraph &b_depsgraph, bool update_all);
@@ -153,16 +160,12 @@ class BlenderSync {
   /* Hair */
   void sync_hair(BL::Depsgraph b_depsgraph,
                  BL::Object b_ob,
-                 Geometry *geom,
+                 Hair *hair,
                  const vector<Shader *> &used_shaders);
-  void sync_hair_motion(BL::Depsgraph b_depsgraph,
-                        BL::Object b_ob,
-                        Geometry *geom,
-                        int motion_step);
+  void sync_hair_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, Hair *hair, int motion_step);
   void sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step = 0);
   void sync_particle_hair(
-      Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
-  void sync_curve_settings(BL::Depsgraph &b_depsgraph);
+      Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
   bool object_has_particle_hair(BL::Object b_ob);
 
   /* Camera */
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
index 93e84e28032..73ef5f94720 100644
--- a/intern/cycles/blender/blender_viewport.cpp
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -61,17 +61,6 @@ const bool BlenderViewportParameters::custom_viewport_parameters() const
   return !(use_scene_world && use_scene_lights);
 }
 
-bool BlenderViewportParameters::get_viewport_display_denoising(BL::SpaceView3D &b_v3d,
-                                                               BL::Scene &b_scene)
-{
-  bool use_denoising = false;
-  if (b_v3d) {
-    PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-    use_denoising = get_enum(cscene, "preview_denoising") != 0;
-  }
-  return use_denoising;
-}
-
 PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceView3D &b_v3d)
 {
   PassType display_pass = PASS_NONE;
@@ -83,11 +72,6 @@ PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceVi
   return display_pass;
 }
 
-bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene)
-{
-  return BlenderViewportParameters::get_viewport_display_denoising(b_v3d, b_scene);
-}
-
 PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
 {
   if (b_v3d) {
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
index 3e44e552f1d..7c6c9c4d274 100644
--- a/intern/cycles/blender/blender_viewport.h
+++ b/intern/cycles/blender/blender_viewport.h
@@ -44,15 +44,11 @@ class BlenderViewportParameters {
   friend class BlenderSync;
 
  public:
-  /* Get whether to enable denoising data pass in viewport. */
-  static bool get_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene);
   /* Retrieve the render pass that needs to be displayed on the given `SpaceView3D`
    * When the `b_v3d` parameter is not given `PASS_NONE` will be returned. */
   static PassType get_viewport_display_render_pass(BL::SpaceView3D &b_v3d);
 };
 
-bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene);
-
 PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_volume.cpp b/intern/cycles/blender/blender_volume.cpp
index 4eed6be8c7c..80591e0eec8 100644
--- a/intern/cycles/blender/blender_volume.cpp
+++ b/intern/cycles/blender/blender_volume.cpp
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
 class BlenderSmokeLoader : public ImageLoader {
  public:
   BlenderSmokeLoader(BL::Object &b_ob, AttributeStandard attribute)
-      : b_domain(object_fluid_gas_domain_find(b_ob)), b_mesh(b_ob.data()), attribute(attribute)
+      : b_domain(object_fluid_gas_domain_find(b_ob)), attribute(attribute)
   {
+    BL::Mesh b_mesh(b_ob.data());
+    mesh_texture_space(b_mesh, texspace_loc, texspace_size);
   }
 
   bool load_metadata(ImageMetaData &metadata) override
@@ -77,9 +79,7 @@ class BlenderSmokeLoader : public ImageLoader {
     /* Create a matrix to transform from object space to mesh texture space.
      * This does not work with deformations but that can probably only be done
      * well with a volume grid mapping of coordinates. */
-    float3 loc, size;
-    mesh_texture_space(b_mesh, loc, size);
-    metadata.transform_3d = transform_translate(-loc) * transform_scale(size);
+    metadata.transform_3d = transform_translate(-texspace_loc) * transform_scale(texspace_size);
     metadata.use_transform_3d = true;
 
     return true;
@@ -177,7 +177,7 @@ class BlenderSmokeLoader : public ImageLoader {
   }
 
   BL::FluidDomainSettings b_domain;
-  BL::Mesh b_mesh;
+  float3 texspace_loc, texspace_size;
   AttributeStandard attribute;
 };
 
@@ -216,25 +216,16 @@ static void sync_smoke_volume(Scene *scene, BL::Object &b_ob, Mesh *mesh, float
 
 class BlenderVolumeLoader : public VDBImageLoader {
  public:
-  BlenderVolumeLoader(BL::Volume b_volume, const string &grid_name)
-      : VDBImageLoader(grid_name),
-        b_volume(b_volume),
-        b_volume_grid(PointerRNA_NULL),
-        unload(false)
+  BlenderVolumeLoader(BL::BlendData &b_data, BL::Volume &b_volume, const string &grid_name)
+      : VDBImageLoader(grid_name), b_data(b_data), b_volume(b_volume), unload(false)
   {
-#ifdef WITH_OPENVDB
-    /* Find grid with matching name. */
-    BL::Volume::grids_iterator b_grid_iter;
-    for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
-      if (b_grid_iter->name() == grid_name) {
-        b_volume_grid = *b_grid_iter;
-      }
-    }
-#endif
   }
 
   bool load_metadata(ImageMetaData &metadata) override
   {
+    b_volume.grids.load(b_data.ptr.data);
+    BL::VolumeGrid b_volume_grid = find_grid();
+
     if (!b_volume_grid) {
       return false;
     }
@@ -255,6 +246,9 @@ class BlenderVolumeLoader : public VDBImageLoader {
                    const size_t pixel_size,
                    const bool associate_alpha) override
   {
+    b_volume.grids.load(b_data.ptr.data);
+    BL::VolumeGrid b_volume_grid = find_grid();
+
     if (!b_volume_grid) {
       return false;
     }
@@ -266,19 +260,38 @@ class BlenderVolumeLoader : public VDBImageLoader {
   {
     /* TODO: detect multiple volume datablocks with the same filepath. */
     const BlenderVolumeLoader &other_loader = (const BlenderVolumeLoader &)other;
-    return b_volume == other_loader.b_volume && b_volume_grid == other_loader.b_volume_grid;
+    return b_volume == other_loader.b_volume && grid_name == other_loader.grid_name;
   }
 
   void cleanup() override
   {
     VDBImageLoader::cleanup();
+
+    BL::VolumeGrid b_volume_grid = find_grid();
     if (b_volume_grid && unload) {
       b_volume_grid.unload();
     }
   }
 
+  /* Find grid with matching name. The grid pointer is not stored in the class
+   * since grids may be unloaded before we load the pixels, for example for
+   * motion blur where we move between frames. */
+  BL::VolumeGrid find_grid()
+  {
+#ifdef WITH_OPENVDB
+    BL::Volume::grids_iterator b_grid_iter;
+    for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
+      if (b_grid_iter->name() == grid_name) {
+        return *b_grid_iter;
+      }
+    }
+#endif
+
+    return BL::VolumeGrid(PointerRNA_NULL);
+  }
+
+  BL::BlendData b_data;
   BL::Volume b_volume;
-  BL::VolumeGrid b_volume_grid;
   bool unload;
 };
 
@@ -325,7 +338,7 @@ static void sync_volume_object(BL::BlendData &b_data, BL::Object &b_ob, Scene *s
                             mesh->attributes.add(std) :
                             mesh->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
 
-      ImageLoader *loader = new BlenderVolumeLoader(b_volume, name.string());
+      ImageLoader *loader = new BlenderVolumeLoader(b_data, b_volume, name.string());
       ImageParams params;
       params.frame = b_volume.grids.frame();
 
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index fb724704a84..8b8f3ca7265 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -9,8 +9,6 @@ set(INC_SYS
 set(SRC
   bvh.cpp
   bvh2.cpp
-  bvh4.cpp
-  bvh8.cpp
   bvh_binning.cpp
   bvh_build.cpp
   bvh_embree.cpp
@@ -24,8 +22,6 @@ set(SRC
 set(SRC_HEADERS
   bvh.h
   bvh2.h
-  bvh4.h
-  bvh8.h
   bvh_binning.h
   bvh_build.h
   bvh_embree.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 0313bcd68b0..e9e67fd1305 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -22,17 +22,10 @@
 #include "render/object.h"
 
 #include "bvh/bvh2.h"
-#include "bvh/bvh4.h"
-#include "bvh/bvh8.h"
 #include "bvh/bvh_build.h"
+#include "bvh/bvh_embree.h"
 #include "bvh/bvh_node.h"
-
-#ifdef WITH_OPTIX
-#  include "bvh/bvh_optix.h"
-#endif
-#ifdef WITH_EMBREE
-#  include "bvh/bvh_embree.h"
-#endif
+#include "bvh/bvh_optix.h"
 
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
@@ -47,10 +40,6 @@ const char *bvh_layout_name(BVHLayout layout)
   switch (layout) {
     case BVH_LAYOUT_BVH2:
       return "BVH2";
-    case BVH_LAYOUT_BVH4:
-      return "BVH4";
-    case BVH_LAYOUT_BVH8:
-      return "BVH8";
     case BVH_LAYOUT_NONE:
       return "NONE";
     case BVH_LAYOUT_EMBREE:
@@ -114,10 +103,6 @@ BVH *BVH::create(const BVHParams &params,
   switch (params.bvh_layout) {
     case BVH_LAYOUT_BVH2:
       return new BVH2(params, geometry, objects);
-    case BVH_LAYOUT_BVH4:
-      return new BVH4(params, geometry, objects);
-    case BVH_LAYOUT_BVH8:
-      return new BVH8(params, geometry, objects);
     case BVH_LAYOUT_EMBREE:
 #ifdef WITH_EMBREE
       return new BVHEmbree(params, geometry, objects);
@@ -337,13 +322,6 @@ void BVH::pack_primitives()
 
 void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 {
-  /* The BVH's for instances are built separately, but for traversal all
-   * BVH's are stored in global arrays. This function merges them into the
-   * top level BVH, adjusting indexes and offsets where appropriate.
-   */
-  const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
-  const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
-
   /* Adjust primitive index to point to the triangle in the global array, for
    * geometry with transform applied and already in the top level BVH.
    */
@@ -506,53 +484,21 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
       for (size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
         size_t nsize, nsize_bbox;
         if (bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
-          if (use_obvh) {
-            nsize = BVH_UNALIGNED_ONODE_SIZE;
-            nsize_bbox = BVH_UNALIGNED_ONODE_SIZE - 1;
-          }
-          else {
-            nsize = use_qbvh ? BVH_UNALIGNED_QNODE_SIZE : BVH_UNALIGNED_NODE_SIZE;
-            nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE - 1 : 0;
-          }
+          nsize = BVH_UNALIGNED_NODE_SIZE;
+          nsize_bbox = 0;
         }
         else {
-          if (use_obvh) {
-            nsize = BVH_ONODE_SIZE;
-            nsize_bbox = BVH_ONODE_SIZE - 1;
-          }
-          else {
-            nsize = (use_qbvh) ? BVH_QNODE_SIZE : BVH_NODE_SIZE;
-            nsize_bbox = (use_qbvh) ? BVH_QNODE_SIZE - 1 : 0;
-          }
+          nsize = BVH_NODE_SIZE;
+          nsize_bbox = 0;
         }
 
         memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox * sizeof(int4));
 
         /* Modify offsets into arrays */
         int4 data = bvh_nodes[i + nsize_bbox];
-
-        if (use_obvh) {
-          int4 data1 = bvh_nodes[i + nsize_bbox - 1];
-          data.z += (data.z < 0) ? -noffset_leaf : noffset;
-          data.w += (data.w < 0) ? -noffset_leaf : noffset;
-          data.x += (data.x < 0) ? -noffset_leaf : noffset;
-          data.y += (data.y < 0) ? -noffset_leaf : noffset;
-          data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
-          data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
-          data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
-          data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
-          pack_nodes[pack_nodes_offset + nsize_bbox] = data;
-          pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
-        }
-        else {
-          data.z += (data.z < 0) ? -noffset_leaf : noffset;
-          data.w += (data.w < 0) ? -noffset_leaf : noffset;
-          if (use_qbvh) {
-            data.x += (data.x < 0) ? -noffset_leaf : noffset;
-            data.y += (data.y < 0) ? -noffset_leaf : noffset;
-          }
-          pack_nodes[pack_nodes_offset + nsize_bbox] = data;
-        }
+        data.z += (data.z < 0) ? -noffset_leaf : noffset;
+        data.w += (data.w < 0) ? -noffset_leaf : noffset;
+        pack_nodes[pack_nodes_offset + nsize_bbox] = data;
 
         /* Usually this copies nothing, but we better
          * be prepared for possible node size extension.
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index bdde38640c9..6639e06b0bc 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -76,7 +76,7 @@ struct PackedBVH {
   }
 };
 
-enum BVH_TYPE { bvh2, bvh4, bvh8 };
+enum BVH_TYPE { bvh2 };
 
 /* BVH */
 
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
deleted file mode 100644
index 143c3e54f94..00000000000
--- a/intern/cycles/bvh/bvh4.cpp
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Adapted from code copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bvh/bvh4.h"
-
-#include "render/mesh.h"
-#include "render/object.h"
-
-#include "bvh/bvh_node.h"
-#include "bvh/bvh_unaligned.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Can we avoid this somehow or make more generic?
- *
- * Perhaps we can merge nodes in actual tree and make our
- * life easier all over the place.
- */
-
-BVH4::BVH4(const BVHParams &params_,
-           const vector<Geometry *> &geometry_,
-           const vector<Object *> &objects_)
-    : BVH(params_, geometry_, objects_)
-{
-  params.bvh_layout = BVH_LAYOUT_BVH4;
-}
-
-namespace {
-
-BVHNode *bvh_node_merge_children_recursively(const BVHNode *node)
-{
-  if (node->is_leaf()) {
-    return new LeafNode(*reinterpret_cast<const LeafNode *>(node));
-  }
-  /* Collect nodes of one layer deeper, allowing us to have more children in an inner layer. */
-  assert(node->num_children() <= 2);
-  const BVHNode *children[4];
-  const BVHNode *child0 = node->get_child(0);
-  const BVHNode *child1 = node->get_child(1);
-  int num_children = 0;
-  if (child0->is_leaf()) {
-    children[num_children++] = child0;
-  }
-  else {
-    children[num_children++] = child0->get_child(0);
-    children[num_children++] = child0->get_child(1);
-  }
-  if (child1->is_leaf()) {
-    children[num_children++] = child1;
-  }
-  else {
-    children[num_children++] = child1->get_child(0);
-    children[num_children++] = child1->get_child(1);
-  }
-  /* Merge children in subtrees. */
-  BVHNode *children4[4];
-  for (int i = 0; i < num_children; ++i) {
-    children4[i] = bvh_node_merge_children_recursively(children[i]);
-  }
-  /* Allocate new node. */
-  BVHNode *node4 = new InnerNode(node->bounds, children4, num_children);
-  /* TODO(sergey): Consider doing this from the InnerNode() constructor.
-   * But in order to do this nicely need to think of how to pass all the
-   * parameters there. */
-  if (node->is_unaligned) {
-    node4->is_unaligned = true;
-    node4->aligned_space = new Transform();
-    *node4->aligned_space = *node->aligned_space;
-  }
-  return node4;
-}
-
-}  // namespace
-
-BVHNode *BVH4::widen_children_nodes(const BVHNode *root)
-{
-  if (root == NULL) {
-    return NULL;
-  }
-  if (root->is_leaf()) {
-    return const_cast<BVHNode *>(root);
-  }
-  BVHNode *root4 = bvh_node_merge_children_recursively(root);
-  /* TODO(sergey): Pack children nodes to parents which has less that 4
-   * children. */
-  return root4;
-}
-
-void BVH4::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf)
-{
-  float4 data[BVH_QNODE_LEAF_SIZE];
-  memset(data, 0, sizeof(data));
-  if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
-    /* object */
-    data[0].x = __int_as_float(~(leaf->lo));
-    data[0].y = __int_as_float(0);
-  }
-  else {
-    /* triangle */
-    data[0].x = __int_as_float(leaf->lo);
-    data[0].y = __int_as_float(leaf->hi);
-  }
-  data[0].z = __uint_as_float(leaf->visibility);
-  if (leaf->num_triangles() != 0) {
-    data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
-  }
-
-  memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_QNODE_LEAF_SIZE);
-}
-
-void BVH4::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  bool has_unaligned = false;
-  /* Check whether we have to create unaligned node or all nodes are aligned
-   * and we can cut some corner here.
-   */
-  if (params.use_unaligned_nodes) {
-    for (int i = 0; i < num; i++) {
-      if (en[i].node->is_unaligned) {
-        has_unaligned = true;
-        break;
-      }
-    }
-  }
-  if (has_unaligned) {
-    /* There's no unaligned children, pack into AABB node. */
-    pack_unaligned_inner(e, en, num);
-  }
-  else {
-    /* Create unaligned node with orientation transform for each of the
-     * children.
-     */
-    pack_aligned_inner(e, en, num);
-  }
-}
-
-void BVH4::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  BoundBox bounds[4];
-  int child[4];
-  for (int i = 0; i < num; ++i) {
-    bounds[i] = en[i].node->bounds;
-    child[i] = en[i].encodeIdx();
-  }
-  pack_aligned_node(
-      e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num);
-}
-
-void BVH4::pack_aligned_node(int idx,
-                             const BoundBox *bounds,
-                             const int *child,
-                             const uint visibility,
-                             const float time_from,
-                             const float time_to,
-                             const int num)
-{
-  float4 data[BVH_QNODE_SIZE];
-  memset(data, 0, sizeof(data));
-
-  data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
-  data[0].y = time_from;
-  data[0].z = time_to;
-
-  for (int i = 0; i < num; i++) {
-    float3 bb_min = bounds[i].min;
-    float3 bb_max = bounds[i].max;
-
-    data[1][i] = bb_min.x;
-    data[2][i] = bb_max.x;
-    data[3][i] = bb_min.y;
-    data[4][i] = bb_max.y;
-    data[5][i] = bb_min.z;
-    data[6][i] = bb_max.z;
-
-    data[7][i] = __int_as_float(child[i]);
-  }
-
-  for (int i = num; i < 4; i++) {
-    /* We store BB which would never be recorded as intersection
-     * so kernel might safely assume there are always 4 child nodes.
-     */
-    data[1][i] = FLT_MAX;
-    data[2][i] = -FLT_MAX;
-
-    data[3][i] = FLT_MAX;
-    data[4][i] = -FLT_MAX;
-
-    data[5][i] = FLT_MAX;
-    data[6][i] = -FLT_MAX;
-
-    data[7][i] = __int_as_float(0);
-  }
-
-  memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_QNODE_SIZE);
-}
-
-void BVH4::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  Transform aligned_space[4];
-  BoundBox bounds[4];
-  int child[4];
-  for (int i = 0; i < num; ++i) {
-    aligned_space[i] = en[i].node->get_aligned_space();
-    bounds[i] = en[i].node->bounds;
-    child[i] = en[i].encodeIdx();
-  }
-  pack_unaligned_node(e.idx,
-                      aligned_space,
-                      bounds,
-                      child,
-                      e.node->visibility,
-                      e.node->time_from,
-                      e.node->time_to,
-                      num);
-}
-
-void BVH4::pack_unaligned_node(int idx,
-                               const Transform *aligned_space,
-                               const BoundBox *bounds,
-                               const int *child,
-                               const uint visibility,
-                               const float time_from,
-                               const float time_to,
-                               const int num)
-{
-  float4 data[BVH_UNALIGNED_QNODE_SIZE];
-  memset(data, 0, sizeof(data));
-
-  data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
-  data[0].y = time_from;
-  data[0].z = time_to;
-
-  for (int i = 0; i < num; i++) {
-    Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]);
-
-    data[1][i] = space.x.x;
-    data[2][i] = space.x.y;
-    data[3][i] = space.x.z;
-
-    data[4][i] = space.y.x;
-    data[5][i] = space.y.y;
-    data[6][i] = space.y.z;
-
-    data[7][i] = space.z.x;
-    data[8][i] = space.z.y;
-    data[9][i] = space.z.z;
-
-    data[10][i] = space.x.w;
-    data[11][i] = space.y.w;
-    data[12][i] = space.z.w;
-
-    data[13][i] = __int_as_float(child[i]);
-  }
-
-  for (int i = num; i < 4; i++) {
-    /* We store BB which would never be recorded as intersection
-     * so kernel might safely assume there are always 4 child nodes.
-     */
-
-    data[1][i] = NAN;
-    data[2][i] = NAN;
-    data[3][i] = NAN;
-
-    data[4][i] = NAN;
-    data[5][i] = NAN;
-    data[6][i] = NAN;
-
-    data[7][i] = NAN;
-    data[8][i] = NAN;
-    data[9][i] = NAN;
-
-    data[10][i] = NAN;
-    data[11][i] = NAN;
-    data[12][i] = NAN;
-
-    data[13][i] = __int_as_float(0);
-  }
-
-  memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_QNODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void BVH4::pack_nodes(const BVHNode *root)
-{
-  /* Calculate size of the arrays required. */
-  const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
-  const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
-  assert(num_leaf_nodes <= num_nodes);
-  const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
-  size_t node_size;
-  if (params.use_unaligned_nodes) {
-    const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
-    node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
-                (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
-  }
-  else {
-    node_size = num_inner_nodes * BVH_QNODE_SIZE;
-  }
-  /* Resize arrays. */
-  pack.nodes.clear();
-  pack.leaf_nodes.clear();
-  /* For top level BVH, first merge existing BVH's so we know the offsets. */
-  if (params.top_level) {
-    pack_instances(node_size, num_leaf_nodes * BVH_QNODE_LEAF_SIZE);
-  }
-  else {
-    pack.nodes.resize(node_size);
-    pack.leaf_nodes.resize(num_leaf_nodes * BVH_QNODE_LEAF_SIZE);
-  }
-
-  int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
-  vector<BVHStackEntry> stack;
-  stack.reserve(BVHParams::MAX_DEPTH * 2);
-  if (root->is_leaf()) {
-    stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
-  }
-  else {
-    stack.push_back(BVHStackEntry(root, nextNodeIdx));
-    nextNodeIdx += root->has_unaligned() ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE;
-  }
-
-  while (stack.size()) {
-    BVHStackEntry e = stack.back();
-    stack.pop_back();
-
-    if (e.node->is_leaf()) {
-      /* leaf node */
-      const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node);
-      pack_leaf(e, leaf);
-    }
-    else {
-      /* Inner node. */
-      /* Collect nodes. */
-      const BVHNode *children[4];
-      const int num_children = e.node->num_children();
-      /* Push entries on the stack. */
-      for (int i = 0; i < num_children; ++i) {
-        int idx;
-        children[i] = e.node->get_child(i);
-        assert(children[i] != NULL);
-        if (children[i]->is_leaf()) {
-          idx = nextLeafNodeIdx++;
-        }
-        else {
-          idx = nextNodeIdx;
-          nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE;
-        }
-        stack.push_back(BVHStackEntry(children[i], idx));
-      }
-      /* Set node. */
-      pack_inner(e, &stack[stack.size() - num_children], num_children);
-    }
-  }
-
-  assert(node_size == nextNodeIdx);
-  /* Root index to start traversal at, to handle case of single leaf node. */
-  pack.root_index = (root->is_leaf()) ? -1 : 0;
-}
-
-void BVH4::refit_nodes()
-{
-  assert(!params.top_level);
-
-  BoundBox bbox = BoundBox::empty;
-  uint visibility = 0;
-  refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility);
-}
-
-void BVH4::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility)
-{
-  if (leaf) {
-    /* Refit leaf node. */
-    int4 *data = &pack.leaf_nodes[idx];
-    int4 c = data[0];
-
-    BVH::refit_primitives(c.x, c.y, bbox, visibility);
-
-    /* TODO(sergey): This is actually a copy of pack_leaf(),
-     * but this chunk of code only knows actual data and has
-     * no idea about BVHNode.
-     *
-     * Would be nice to de-duplicate code, but trying to make
-     * making code more general ends up in much nastier code
-     * in my opinion so far.
-     *
-     * Same applies to the inner nodes case below.
-     */
-    float4 leaf_data[BVH_QNODE_LEAF_SIZE];
-    leaf_data[0].x = __int_as_float(c.x);
-    leaf_data[0].y = __int_as_float(c.y);
-    leaf_data[0].z = __uint_as_float(visibility);
-    leaf_data[0].w = __uint_as_float(c.w);
-    memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_QNODE_LEAF_SIZE);
-  }
-  else {
-    int4 *data = &pack.nodes[idx];
-    bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
-    int4 c;
-    if (is_unaligned) {
-      c = data[13];
-    }
-    else {
-      c = data[7];
-    }
-    /* Refit inner node, set bbox from children. */
-    BoundBox child_bbox[4] = {BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty};
-    uint child_visibility[4] = {0};
-    int num_nodes = 0;
-
-    for (int i = 0; i < 4; ++i) {
-      if (c[i] != 0) {
-        refit_node((c[i] < 0) ? -c[i] - 1 : c[i], (c[i] < 0), child_bbox[i], child_visibility[i]);
-        ++num_nodes;
-        bbox.grow(child_bbox[i]);
-        visibility |= child_visibility[i];
-      }
-    }
-
-    if (is_unaligned) {
-      Transform aligned_space[4] = {
-          transform_identity(), transform_identity(), transform_identity(), transform_identity()};
-      pack_unaligned_node(
-          idx, aligned_space, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes);
-    }
-    else {
-      pack_aligned_node(idx, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes);
-    }
-  }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
deleted file mode 100644
index afbb9007afb..00000000000
--- a/intern/cycles/bvh/bvh4.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Adapted from code copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __BVH4_H__
-#define __BVH4_H__
-
-#include "bvh/bvh.h"
-#include "bvh/bvh_params.h"
-
-#include "util/util_types.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class BVHNode;
-struct BVHStackEntry;
-class BVHParams;
-class BoundBox;
-class LeafNode;
-class Object;
-class Progress;
-
-#define BVH_QNODE_SIZE 8
-#define BVH_QNODE_LEAF_SIZE 1
-#define BVH_UNALIGNED_QNODE_SIZE 14
-
-/* BVH4
- *
- * Quad BVH, with each node having four children, to use with SIMD instructions.
- */
-class BVH4 : public BVH {
- protected:
-  /* constructor */
-  friend class BVH;
-  BVH4(const BVHParams &params,
-       const vector<Geometry *> &geometry,
-       const vector<Object *> &objects);
-
-  /* Building process. */
-  virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
-
-  /* pack */
-  void pack_nodes(const BVHNode *root) override;
-
-  void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf);
-  void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-
-  void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-  void pack_aligned_node(int idx,
-                         const BoundBox *bounds,
-                         const int *child,
-                         const uint visibility,
-                         const float time_from,
-                         const float time_to,
-                         const int num);
-
-  void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-  void pack_unaligned_node(int idx,
-                           const Transform *aligned_space,
-                           const BoundBox *bounds,
-                           const int *child,
-                           const uint visibility,
-                           const float time_from,
-                           const float time_to,
-                           const int num);
-
-  /* refit */
-  void refit_nodes() override;
-  void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __BVH4_H__ */
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
deleted file mode 100644
index b805865b2c8..00000000000
--- a/intern/cycles/bvh/bvh8.cpp
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Original code Copyright 2017, Intel Corporation
- * Modifications Copyright 2018, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "bvh/bvh8.h"
-
-#include "render/hair.h"
-#include "render/mesh.h"
-#include "render/object.h"
-
-#include "bvh/bvh_node.h"
-#include "bvh/bvh_unaligned.h"
-
-CCL_NAMESPACE_BEGIN
-
-BVH8::BVH8(const BVHParams &params_,
-           const vector<Geometry *> &geometry_,
-           const vector<Object *> &objects_)
-    : BVH(params_, geometry_, objects_)
-{
-}
-
-namespace {
-
-BVHNode *bvh_node_merge_children_recursively(const BVHNode *node)
-{
-  if (node->is_leaf()) {
-    return new LeafNode(*reinterpret_cast<const LeafNode *>(node));
-  }
-  /* Collect nodes of two layer deeper, allowing us to have more childrem in
-   * an inner layer. */
-  assert(node->num_children() <= 2);
-  const BVHNode *children[8];
-  const BVHNode *child0 = node->get_child(0);
-  const BVHNode *child1 = node->get_child(1);
-  int num_children = 0;
-  if (child0->is_leaf()) {
-    children[num_children++] = child0;
-  }
-  else {
-    const BVHNode *child00 = child0->get_child(0), *child01 = child0->get_child(1);
-    if (child00->is_leaf()) {
-      children[num_children++] = child00;
-    }
-    else {
-      children[num_children++] = child00->get_child(0);
-      children[num_children++] = child00->get_child(1);
-    }
-    if (child01->is_leaf()) {
-      children[num_children++] = child01;
-    }
-    else {
-      children[num_children++] = child01->get_child(0);
-      children[num_children++] = child01->get_child(1);
-    }
-  }
-  if (child1->is_leaf()) {
-    children[num_children++] = child1;
-  }
-  else {
-    const BVHNode *child10 = child1->get_child(0), *child11 = child1->get_child(1);
-    if (child10->is_leaf()) {
-      children[num_children++] = child10;
-    }
-    else {
-      children[num_children++] = child10->get_child(0);
-      children[num_children++] = child10->get_child(1);
-    }
-    if (child11->is_leaf()) {
-      children[num_children++] = child11;
-    }
-    else {
-      children[num_children++] = child11->get_child(0);
-      children[num_children++] = child11->get_child(1);
-    }
-  }
-  /* Merge children in subtrees. */
-  BVHNode *children4[8];
-  for (int i = 0; i < num_children; ++i) {
-    children4[i] = bvh_node_merge_children_recursively(children[i]);
-  }
-  /* Allocate new node. */
-  BVHNode *node8 = new InnerNode(node->bounds, children4, num_children);
-  /* TODO(sergey): Consider doing this from the InnerNode() constructor.
-   * But in order to do this nicely need to think of how to pass all the
-   * parameters there. */
-  if (node->is_unaligned) {
-    node8->is_unaligned = true;
-    node8->aligned_space = new Transform();
-    *node8->aligned_space = *node->aligned_space;
-  }
-  return node8;
-}
-
-}  // namespace
-
-BVHNode *BVH8::widen_children_nodes(const BVHNode *root)
-{
-  if (root == NULL) {
-    return NULL;
-  }
-  if (root->is_leaf()) {
-    return const_cast<BVHNode *>(root);
-  }
-  BVHNode *root8 = bvh_node_merge_children_recursively(root);
-  /* TODO(sergey): Pack children nodes to parents which has less that 4
-   * children. */
-  return root8;
-}
-
-void BVH8::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf)
-{
-  float4 data[BVH_ONODE_LEAF_SIZE];
-  memset(data, 0, sizeof(data));
-  if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
-    /* object */
-    data[0].x = __int_as_float(~(leaf->lo));
-    data[0].y = __int_as_float(0);
-  }
-  else {
-    /* triangle */
-    data[0].x = __int_as_float(leaf->lo);
-    data[0].y = __int_as_float(leaf->hi);
-  }
-  data[0].z = __uint_as_float(leaf->visibility);
-  if (leaf->num_triangles() != 0) {
-    data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
-  }
-
-  memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_ONODE_LEAF_SIZE);
-}
-
-void BVH8::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  bool has_unaligned = false;
-  /* Check whether we have to create unaligned node or all nodes are aligned
-   * and we can cut some corner here.
-   */
-  if (params.use_unaligned_nodes) {
-    for (int i = 0; i < num; i++) {
-      if (en[i].node->is_unaligned) {
-        has_unaligned = true;
-        break;
-      }
-    }
-  }
-  if (has_unaligned) {
-    /* There's no unaligned children, pack into AABB node. */
-    pack_unaligned_inner(e, en, num);
-  }
-  else {
-    /* Create unaligned node with orientation transform for each of the
-     * children.
-     */
-    pack_aligned_inner(e, en, num);
-  }
-}
-
-void BVH8::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  BoundBox bounds[8];
-  int child[8];
-  for (int i = 0; i < num; ++i) {
-    bounds[i] = en[i].node->bounds;
-    child[i] = en[i].encodeIdx();
-  }
-  pack_aligned_node(
-      e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num);
-}
-
-void BVH8::pack_aligned_node(int idx,
-                             const BoundBox *bounds,
-                             const int *child,
-                             const uint visibility,
-                             const float time_from,
-                             const float time_to,
-                             const int num)
-{
-  float8 data[8];
-  memset(data, 0, sizeof(data));
-
-  data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
-  data[0].b = time_from;
-  data[0].c = time_to;
-
-  for (int i = 0; i < num; i++) {
-    float3 bb_min = bounds[i].min;
-    float3 bb_max = bounds[i].max;
-
-    data[1][i] = bb_min.x;
-    data[2][i] = bb_max.x;
-    data[3][i] = bb_min.y;
-    data[4][i] = bb_max.y;
-    data[5][i] = bb_min.z;
-    data[6][i] = bb_max.z;
-
-    data[7][i] = __int_as_float(child[i]);
-  }
-
-  for (int i = num; i < 8; i++) {
-    /* We store BB which would never be recorded as intersection
-     * so kernel might safely assume there are always 4 child nodes.
-     */
-    data[1][i] = FLT_MAX;
-    data[2][i] = -FLT_MAX;
-
-    data[3][i] = FLT_MAX;
-    data[4][i] = -FLT_MAX;
-
-    data[5][i] = FLT_MAX;
-    data[6][i] = -FLT_MAX;
-
-    data[7][i] = __int_as_float(0);
-  }
-
-  memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_ONODE_SIZE);
-}
-
-void BVH8::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
-  Transform aligned_space[8];
-  BoundBox bounds[8];
-  int child[8];
-  for (int i = 0; i < num; ++i) {
-    aligned_space[i] = en[i].node->get_aligned_space();
-    bounds[i] = en[i].node->bounds;
-    child[i] = en[i].encodeIdx();
-  }
-  pack_unaligned_node(e.idx,
-                      aligned_space,
-                      bounds,
-                      child,
-                      e.node->visibility,
-                      e.node->time_from,
-                      e.node->time_to,
-                      num);
-}
-
-void BVH8::pack_unaligned_node(int idx,
-                               const Transform *aligned_space,
-                               const BoundBox *bounds,
-                               const int *child,
-                               const uint visibility,
-                               const float time_from,
-                               const float time_to,
-                               const int num)
-{
-  float8 data[BVH_UNALIGNED_ONODE_SIZE];
-  memset(data, 0, sizeof(data));
-
-  data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
-  data[0].b = time_from;
-  data[0].c = time_to;
-
-  for (int i = 0; i < num; i++) {
-    Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]);
-
-    data[1][i] = space.x.x;
-    data[2][i] = space.x.y;
-    data[3][i] = space.x.z;
-
-    data[4][i] = space.y.x;
-    data[5][i] = space.y.y;
-    data[6][i] = space.y.z;
-
-    data[7][i] = space.z.x;
-    data[8][i] = space.z.y;
-    data[9][i] = space.z.z;
-
-    data[10][i] = space.x.w;
-    data[11][i] = space.y.w;
-    data[12][i] = space.z.w;
-
-    data[13][i] = __int_as_float(child[i]);
-  }
-
-  for (int i = num; i < 8; i++) {
-    /* We store BB which would never be recorded as intersection
-     * so kernel might safely assume there are always 4 child nodes.
-     */
-
-    data[1][i] = NAN;
-    data[2][i] = NAN;
-    data[3][i] = NAN;
-
-    data[4][i] = NAN;
-    data[5][i] = NAN;
-    data[6][i] = NAN;
-
-    data[7][i] = NAN;
-    data[8][i] = NAN;
-    data[9][i] = NAN;
-
-    data[10][i] = NAN;
-    data[11][i] = NAN;
-    data[12][i] = NAN;
-
-    data[13][i] = __int_as_float(0);
-  }
-
-  memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_ONODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void BVH8::pack_nodes(const BVHNode *root)
-{
-  /* Calculate size of the arrays required. */
-  const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
-  const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
-  assert(num_leaf_nodes <= num_nodes);
-  const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
-  size_t node_size;
-  if (params.use_unaligned_nodes) {
-    const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
-    node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
-                (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
-  }
-  else {
-    node_size = num_inner_nodes * BVH_ONODE_SIZE;
-  }
-  /* Resize arrays. */
-  pack.nodes.clear();
-  pack.leaf_nodes.clear();
-  /* For top level BVH, first merge existing BVH's so we know the offsets. */
-  if (params.top_level) {
-    pack_instances(node_size, num_leaf_nodes * BVH_ONODE_LEAF_SIZE);
-  }
-  else {
-    pack.nodes.resize(node_size);
-    pack.leaf_nodes.resize(num_leaf_nodes * BVH_ONODE_LEAF_SIZE);
-  }
-
-  int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
-  vector<BVHStackEntry> stack;
-  stack.reserve(BVHParams::MAX_DEPTH * 2);
-  if (root->is_leaf()) {
-    stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
-  }
-  else {
-    stack.push_back(BVHStackEntry(root, nextNodeIdx));
-    nextNodeIdx += root->has_unaligned() ? BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE;
-  }
-
-  while (stack.size()) {
-    BVHStackEntry e = stack.back();
-    stack.pop_back();
-
-    if (e.node->is_leaf()) {
-      /* leaf node */
-      const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node);
-      pack_leaf(e, leaf);
-    }
-    else {
-      /* Inner node. */
-      /* Collect nodes. */
-      const BVHNode *children[8];
-      int num_children = e.node->num_children();
-      /* Push entries on the stack. */
-      for (int i = 0; i < num_children; ++i) {
-        int idx;
-        children[i] = e.node->get_child(i);
-        if (children[i]->is_leaf()) {
-          idx = nextLeafNodeIdx++;
-        }
-        else {
-          idx = nextNodeIdx;
-          nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE;
-        }
-        stack.push_back(BVHStackEntry(children[i], idx));
-      }
-      /* Set node. */
-      pack_inner(e, &stack[stack.size() - num_children], num_children);
-    }
-  }
-
-  assert(node_size == nextNodeIdx);
-  /* Root index to start traversal at, to handle case of single leaf node. */
-  pack.root_index = (root->is_leaf()) ? -1 : 0;
-}
-
-void BVH8::refit_nodes()
-{
-  assert(!params.top_level);
-
-  BoundBox bbox = BoundBox::empty;
-  uint visibility = 0;
-  refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility);
-}
-
-void BVH8::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility)
-{
-  if (leaf) {
-    int4 *data = &pack.leaf_nodes[idx];
-    int4 c = data[0];
-    /* Refit leaf node. */
-    for (int prim = c.x; prim < c.y; prim++) {
-      int pidx = pack.prim_index[prim];
-      int tob = pack.prim_object[prim];
-      Object *ob = objects[tob];
-
-      if (pidx == -1) {
-        /* Object instance. */
-        bbox.grow(ob->bounds);
-      }
-      else {
-        /* Primitives. */
-        if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
-          /* Curves. */
-          const Hair *hair = static_cast<const Hair *>(ob->geometry);
-          int prim_offset = (params.top_level) ? hair->prim_offset : 0;
-          Hair::Curve curve = hair->get_curve(pidx - prim_offset);
-          int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
-          curve.bounds_grow(k, &hair->curve_keys[0], &hair->curve_radius[0], bbox);
-
-          /* Motion curves. */
-          if (hair->use_motion_blur) {
-            Attribute *attr = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-            if (attr) {
-              size_t hair_size = hair->curve_keys.size();
-              size_t steps = hair->motion_steps - 1;
-              float3 *key_steps = attr->data_float3();
-
-              for (size_t i = 0; i < steps; i++) {
-                curve.bounds_grow(k, key_steps + i * hair_size, &hair->curve_radius[0], bbox);
-              }
-            }
-          }
-        }
-        else {
-          /* Triangles. */
-          const Mesh *mesh = static_cast<const Mesh *>(ob->geometry);
-          int prim_offset = (params.top_level) ? mesh->prim_offset : 0;
-          Mesh::Triangle triangle = mesh->get_triangle(pidx - prim_offset);
-          const float3 *vpos = &mesh->verts[0];
-
-          triangle.bounds_grow(vpos, bbox);
-
-          /* Motion triangles. */
-          if (mesh->use_motion_blur) {
-            Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-            if (attr) {
-              size_t mesh_size = mesh->verts.size();
-              size_t steps = mesh->motion_steps - 1;
-              float3 *vert_steps = attr->data_float3();
-
-              for (size_t i = 0; i < steps; i++) {
-                triangle.bounds_grow(vert_steps + i * mesh_size, bbox);
-              }
-            }
-          }
-        }
-      }
-
-      visibility |= ob->visibility;
-    }
-
-    float4 leaf_data[BVH_ONODE_LEAF_SIZE];
-    leaf_data[0].x = __int_as_float(c.x);
-    leaf_data[0].y = __int_as_float(c.y);
-    leaf_data[0].z = __uint_as_float(visibility);
-    leaf_data[0].w = __uint_as_float(c.w);
-    memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_ONODE_LEAF_SIZE);
-  }
-  else {
-    float8 *data = (float8 *)&pack.nodes[idx];
-    bool is_unaligned = (__float_as_uint(data[0].a) & PATH_RAY_NODE_UNALIGNED) != 0;
-    /* Refit inner node, set bbox from children. */
-    BoundBox child_bbox[8] = {BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty,
-                              BoundBox::empty};
-    int child[8];
-    uint child_visibility[8] = {0};
-    int num_nodes = 0;
-
-    for (int i = 0; i < 8; ++i) {
-      child[i] = __float_as_int(data[(is_unaligned) ? 13 : 7][i]);
-
-      if (child[i] != 0) {
-        refit_node((child[i] < 0) ? -child[i] - 1 : child[i],
-                   (child[i] < 0),
-                   child_bbox[i],
-                   child_visibility[i]);
-        ++num_nodes;
-        bbox.grow(child_bbox[i]);
-        visibility |= child_visibility[i];
-      }
-    }
-
-    if (is_unaligned) {
-      Transform aligned_space[8] = {transform_identity(),
-                                    transform_identity(),
-                                    transform_identity(),
-                                    transform_identity(),
-                                    transform_identity(),
-                                    transform_identity(),
-                                    transform_identity(),
-                                    transform_identity()};
-      pack_unaligned_node(
-          idx, aligned_space, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes);
-    }
-    else {
-      pack_aligned_node(idx, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes);
-    }
-  }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
deleted file mode 100644
index d23fa528e3e..00000000000
--- a/intern/cycles/bvh/bvh8.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Original code Copyright 2017, Intel Corporation
- * Modifications Copyright 2018, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BVH8_H__
-#define __BVH8_H__
-
-#include "bvh/bvh.h"
-#include "bvh/bvh_params.h"
-
-#include "util/util_types.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class BVHNode;
-struct BVHStackEntry;
-class BVHParams;
-class BoundBox;
-class LeafNode;
-class Object;
-class Progress;
-
-#define BVH_ONODE_SIZE 16
-#define BVH_ONODE_LEAF_SIZE 1
-#define BVH_UNALIGNED_ONODE_SIZE 28
-
-/* BVH8
- *
- * Octo BVH, with each node having eight children, to use with SIMD instructions.
- */
-class BVH8 : public BVH {
- protected:
-  /* constructor */
-  friend class BVH;
-  BVH8(const BVHParams &params,
-       const vector<Geometry *> &geometry,
-       const vector<Object *> &objects);
-
-  /* Building process. */
-  virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
-
-  /* pack */
-  void pack_nodes(const BVHNode *root) override;
-
-  void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf);
-  void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-
-  void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-  void pack_aligned_node(int idx,
-                         const BoundBox *bounds,
-                         const int *child,
-                         const uint visibility,
-                         const float time_from,
-                         const float time_to,
-                         const int num);
-
-  void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-  void pack_unaligned_node(int idx,
-                           const Transform *aligned_space,
-                           const BoundBox *bounds,
-                           const int *child,
-                           const uint visibility,
-                           const float time_from,
-                           const float time_to,
-                           const int num);
-
-  /* refit */
-  void refit_nodes() override;
-  void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __BVH8_H__ */
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 814b5ced5d2..86ab7b00815 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -39,48 +39,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* BVH Build Task */
-
-class BVHBuildTask : public Task {
- public:
-  BVHBuildTask(
-      BVHBuild *build, InnerNode *node, int child, const BVHObjectBinning &range, int level)
-      : range_(range)
-  {
-    run = function_bind(&BVHBuild::thread_build_node, build, node, child, &range_, level);
-  }
-
- private:
-  BVHObjectBinning range_;
-};
-
-class BVHSpatialSplitBuildTask : public Task {
- public:
-  BVHSpatialSplitBuildTask(BVHBuild *build,
-                           InnerNode *node,
-                           int child,
-                           const BVHRange &range,
-                           const vector<BVHReference> &references,
-                           int level)
-      : range_(range),
-        references_(references.begin() + range.start(), references.begin() + range.end())
-  {
-    range_.set_start(0);
-    run = function_bind(&BVHBuild::thread_build_spatial_split_node,
-                        build,
-                        node,
-                        child,
-                        &range_,
-                        &references_,
-                        level,
-                        _1);
-  }
-
- private:
-  BVHRange range_;
-  vector<BVHReference> references_;
-};
-
 /* Constructor / Destructor */
 
 BVHBuild::BVHBuild(const vector<Object *> &objects_,
@@ -201,6 +159,13 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
   if (hair->has_motion_blur()) {
     curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
   }
+
+  const PrimitiveType primitive_type =
+      (curve_attr_mP != NULL) ?
+          ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+                                                 PRIMITIVE_MOTION_CURVE_THICK) :
+          ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK);
+
   const size_t num_curves = hair->num_curves();
   for (uint j = 0; j < num_curves; j++) {
     const Hair::Curve curve = hair->get_curve(j);
@@ -211,7 +176,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
         BoundBox bounds = BoundBox::empty;
         curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds);
         if (bounds.valid()) {
-          int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
+          int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
           references.push_back(BVHReference(bounds, j, i, packed_type));
           root.grow(bounds);
           center.grow(bounds.center2());
@@ -232,7 +197,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
           curve.bounds_grow(k, key_steps + step * num_keys, curve_radius, bounds);
         }
         if (bounds.valid()) {
-          int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+          int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
           references.push_back(BVHReference(bounds, j, i, packed_type));
           root.grow(bounds);
           center.grow(bounds.center2());
@@ -288,7 +253,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
           bounds.grow(curr_bounds);
           if (bounds.valid()) {
             const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
-            int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+            int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
             references.push_back(BVHReference(bounds, j, i, packed_type, prev_time, curr_time));
             root.grow(bounds);
             center.grow(bounds.center2());
@@ -423,22 +388,6 @@ BVHNode *BVHBuild::run()
   }
 
   spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
-  if (params.use_spatial_split) {
-    /* NOTE: The API here tries to be as much ready for multi-threaded build
-     * as possible, but at the same time it tries not to introduce any
-     * changes in behavior for until all refactoring needed for threading is
-     * finished.
-     *
-     * So we currently allocate single storage for now, which is only used by
-     * the only thread working on the spatial BVH build.
-     */
-    spatial_storage.resize(TaskScheduler::num_threads() + 1);
-    size_t num_bins = max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1;
-    foreach (BVHSpatialStorage &storage, spatial_storage) {
-      storage.right_bounds.clear();
-    }
-    spatial_storage[0].right_bounds.resize(num_bins);
-  }
   spatial_free_index = 0;
 
   need_prim_time = params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0;
@@ -465,7 +414,8 @@ BVHNode *BVHBuild::run()
 
   if (params.use_spatial_split) {
     /* Perform multithreaded spatial split build. */
-    rootnode = build_node(root, &references, 0, 0);
+    BVHSpatialStorage *local_storage = &spatial_storage.local();
+    rootnode = build_node(root, references, 0, local_storage);
     task_pool.wait_work();
   }
   else {
@@ -475,6 +425,9 @@ BVHNode *BVHBuild::run()
     task_pool.wait_work();
   }
 
+  /* Free the temporary per-thread storage used during the build. */
+  spatial_storage.clear();
+
   /* delete if we canceled */
   if (rootnode) {
     if (progress.get_cancel()) {
@@ -529,41 +482,46 @@ void BVHBuild::progress_update()
   progress_start_time = time_dt();
 }
 
-void BVHBuild::thread_build_node(InnerNode *inner, int child, BVHObjectBinning *range, int level)
+void BVHBuild::thread_build_node(InnerNode *inner,
+                                 int child,
+                                 const BVHObjectBinning &range,
+                                 int level)
 {
   if (progress.get_cancel())
     return;
 
   /* build nodes */
-  BVHNode *node = build_node(*range, level);
+  BVHNode *node = build_node(range, level);
 
   /* set child in inner node */
   inner->children[child] = node;
 
   /* update progress */
-  if (range->size() < THREAD_TASK_SIZE) {
+  if (range.size() < THREAD_TASK_SIZE) {
     /*rotate(node, INT_MAX, 5);*/
 
     thread_scoped_lock lock(build_mutex);
 
-    progress_count += range->size();
+    progress_count += range.size();
     progress_update();
   }
 }
 
 void BVHBuild::thread_build_spatial_split_node(InnerNode *inner,
                                                int child,
-                                               BVHRange *range,
-                                               vector<BVHReference> *references,
-                                               int level,
-                                               int thread_id)
+                                               const BVHRange &range,
+                                               vector<BVHReference> &references,
+                                               int level)
 {
   if (progress.get_cancel()) {
     return;
   }
 
+  /* Get per-thread memory for spatial split. */
+  BVHSpatialStorage *local_storage = &spatial_storage.local();
+
   /* build nodes */
-  BVHNode *node = build_node(*range, references, level, thread_id);
+  BVHNode *node = build_node(range, references, level, local_storage);
 
   /* set child in inner node */
   inner->children[child] = node;
@@ -586,14 +544,22 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range,
   for (int i = 0; i < size; i++) {
     const BVHReference &ref = references[range.start() + i];
 
-    if (ref.prim_type() & PRIMITIVE_CURVE)
-      num_curves++;
-    if (ref.prim_type() & PRIMITIVE_MOTION_CURVE)
-      num_motion_curves++;
-    else if (ref.prim_type() & PRIMITIVE_TRIANGLE)
-      num_triangles++;
-    else if (ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE)
-      num_motion_triangles++;
+    if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
+      if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+        num_motion_curves++;
+      }
+      else {
+        num_curves++;
+      }
+    }
+    else if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+      if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+        num_motion_triangles++;
+      }
+      else {
+        num_triangles++;
+      }
+    }
   }
 
   return (num_triangles <= params.max_triangle_leaf_size) &&
@@ -675,8 +641,8 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
     /* Threaded build */
     inner = new InnerNode(bounds);
 
-    task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true);
-    task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true);
+    task_pool.push([=] { thread_build_node(inner, 0, left, level + 1); });
+    task_pool.push([=] { thread_build_node(inner, 1, right, level + 1); });
   }
 
   if (do_unalinged_split) {
@@ -688,9 +654,9 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
 
 /* multithreaded spatial split builder */
 BVHNode *BVHBuild::build_node(const BVHRange &range,
-                              vector<BVHReference> *references,
+                              vector<BVHReference> &references,
                               int level,
-                              int thread_id)
+                              BVHSpatialStorage *storage)
 {
   /* Update progress.
    *
@@ -707,18 +673,17 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
   if (!(range.size() > 0 && params.top_level && level == 0)) {
     if (params.small_enough_for_leaf(range.size(), level)) {
       progress_count += range.size();
-      return create_leaf_node(range, *references);
+      return create_leaf_node(range, references);
     }
   }
 
   /* Perform splitting test. */
-  BVHSpatialStorage *storage = &spatial_storage[thread_id];
   BVHMixedSplit split(this, storage, range, references, level);
 
   if (!(range.size() > 0 && params.top_level && level == 0)) {
     if (split.no_split) {
       progress_count += range.size();
-      return create_leaf_node(range, *references);
+      return create_leaf_node(range, references);
     }
   }
   float leafSAH = params.sah_primitive_cost * split.leafSAH;
@@ -731,7 +696,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
   Transform aligned_space;
   bool do_unalinged_split = false;
   if (params.use_unaligned_nodes && splitSAH > params.unaligned_split_threshold * leafSAH) {
-    aligned_space = unaligned_heuristic.compute_aligned_space(range, &references->at(0));
+    aligned_space = unaligned_heuristic.compute_aligned_space(range, &references.at(0));
     unaligned_split = BVHMixedSplit(
         this, storage, range, references, level, &unaligned_heuristic, &aligned_space);
     /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */
@@ -757,8 +722,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
 
   BoundBox bounds;
   if (do_unalinged_split) {
-    bounds = unaligned_heuristic.compute_aligned_boundbox(
-        range, &references->at(0), aligned_space);
+    bounds = unaligned_heuristic.compute_aligned_boundbox(range, &references.at(0), aligned_space);
   }
   else {
     bounds = range.bounds();
@@ -770,24 +734,35 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
     /* Local build. */
 
     /* Build left node. */
-    vector<BVHReference> copy(references->begin() + right.start(),
-                              references->begin() + right.end());
+    vector<BVHReference> right_references(references.begin() + right.start(),
+                                          references.begin() + right.end());
     right.set_start(0);
 
-    BVHNode *leftnode = build_node(left, references, level + 1, thread_id);
+    BVHNode *leftnode = build_node(left, references, level + 1, storage);
 
     /* Build right node. */
-    BVHNode *rightnode = build_node(right, &copy, level + 1, thread_id);
+    BVHNode *rightnode = build_node(right, right_references, level + 1, storage);
 
     inner = new InnerNode(bounds, leftnode, rightnode);
   }
   else {
     /* Threaded build. */
     inner = new InnerNode(bounds);
-    task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 0, left, *references, level + 1),
-                   true);
-    task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 1, right, *references, level + 1),
-                   true);
+
+    vector<BVHReference> left_references(references.begin() + left.start(),
+                                         references.begin() + left.end());
+    vector<BVHReference> right_references(references.begin() + right.start(),
+                                          references.begin() + right.end());
+    right.set_start(0);
+
+    /* Create tasks for left and right nodes: most arguments are captured by copy,
+     * while the reference vectors are moved into the lambdas to avoid copying. */
+    task_pool.push([=, refs = std::move(left_references)]() mutable {
+      thread_build_spatial_split_node(inner, 0, left, refs, level + 1);
+    });
+    task_pool.push([=, refs = std::move(right_references)]() mutable {
+      thread_build_spatial_split_node(inner, 1, right, refs, level + 1);
+    });
   }
 
   if (do_unalinged_split) {
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 3fe4c3799e2..c35af083fbd 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -74,9 +74,9 @@ class BVHBuild {
 
   /* Building. */
   BVHNode *build_node(const BVHRange &range,
-                      vector<BVHReference> *references,
+                      vector<BVHReference> &references,
                       int level,
-                      int thread_id);
+                      BVHSpatialStorage *storage);
   BVHNode *build_node(const BVHObjectBinning &range, int level);
   BVHNode *create_leaf_node(const BVHRange &range, const vector<BVHReference> &references);
   BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);
@@ -86,13 +86,12 @@ class BVHBuild {
 
   /* Threads. */
   enum { THREAD_TASK_SIZE = 4096 };
-  void thread_build_node(InnerNode *node, int child, BVHObjectBinning *range, int level);
+  void thread_build_node(InnerNode *node, int child, const BVHObjectBinning &range, int level);
   void thread_build_spatial_split_node(InnerNode *node,
                                        int child,
-                                       BVHRange *range,
-                                       vector<BVHReference> *references,
-                                       int level,
-                                       int thread_id);
+                                       const BVHRange &range,
+                                       vector<BVHReference> &references,
+                                       int level);
   thread_mutex build_mutex;
 
   /* Progress. */
@@ -127,7 +126,7 @@ class BVHBuild {
 
   /* Spatial splitting. */
   float spatial_min_overlap;
-  vector<BVHSpatialStorage> spatial_storage;
+  enumerable_thread_specific<BVHSpatialStorage> spatial_storage;
   size_t spatial_free_index;
   thread_spin_lock spatial_spin_lock;
 
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 6735202835b..17e1f86a589 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -47,9 +47,11 @@
 #  include "render/hair.h"
 #  include "render/mesh.h"
 #  include "render/object.h"
+
 #  include "util/util_foreach.h"
 #  include "util/util_logging.h"
 #  include "util/util_progress.h"
+#  include "util/util_stats.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -65,30 +67,9 @@ static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS,
  * as well as filtering for volume objects happen here.
  * Cycles' own BVH does that directly inside the traversal calls.
  */
-static void rtc_filter_func(const RTCFilterFunctionNArguments *args)
-{
-  /* Current implementation in Cycles assumes only single-ray intersection queries. */
-  assert(args->N == 1);
-
-  const RTCRay *ray = (RTCRay *)args->ray;
-  const RTCHit *hit = (RTCHit *)args->hit;
-  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
-  KernelGlobals *kg = ctx->kg;
-
-  /* Check if there is backfacing hair to ignore. */
-  if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) &&
-      !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) &&
-      !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) {
-    if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
-            make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
-      *args->valid = 0;
-      return;
-    }
-  }
-}
-
 static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
 {
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
   assert(args->N == 1);
 
   const RTCRay *ray = (RTCRay *)args->ray;
@@ -96,17 +77,6 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
   CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
   KernelGlobals *kg = ctx->kg;
 
-  /* For all ray types: Check if there is backfacing hair to ignore */
-  if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) &&
-      !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) &&
-      !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) {
-    if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
-            make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
-      *args->valid = 0;
-      return;
-    }
-  }
-
   switch (ctx->type) {
     case CCLIntersectContext::RAY_SHADOW_ALL: {
       /* Append the intersection to the end of the array. */
@@ -168,7 +138,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
       }
 
       /* Ignore curves. */
-      if (hit->geomID & 1) {
+      if (IS_HAIR(hit->geomID)) {
         /* This tells Embree to continue tracing. */
         *args->valid = 0;
         break;
@@ -249,6 +219,34 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
   }
 }
 
+static void rtc_filter_func_thick_curve(const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+
+  /* Always ignore backfacing intersections. */
+  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+}
+
+static void rtc_filter_occluded_func_thick_curve(const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+
+  /* Always ignore backfacing intersections. */
+  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+
+  rtc_filter_occluded_func(args);
+}
+
 static size_t unaccounted_mem = 0;
 
 static bool rtc_memory_monitor_func(void *userPtr, const ssize_t bytes, const bool)
@@ -326,8 +324,6 @@ BVHEmbree::BVHEmbree(const BVHParams &params_,
       stats(NULL),
       curve_subdivisions(params.curve_subdivisions),
       build_quality(RTC_BUILD_QUALITY_REFIT),
-      use_curves(params_.curve_flags & CURVE_KN_INTERPOLATE),
-      use_ribbons(params.curve_flags & CURVE_KN_RIBBONS),
       dynamic_scene(true)
 {
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
@@ -653,7 +649,6 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
   }
 
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
-  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
   rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
   rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
@@ -724,9 +719,7 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair
 
   /* Catmull-Rom splines need extra CVs at the beginning and end of each curve. */
   size_t num_keys_embree = num_keys;
-  if (use_curves) {
-    num_keys_embree += num_curves * 2;
-  }
+  num_keys_embree += num_curves * 2;
 
   /* Copy the CV data to Embree */
   const int t_mid = (num_motion_steps - 1) / 2;
@@ -746,45 +739,22 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair
 
     assert(rtc_verts);
     if (rtc_verts) {
-      if (use_curves) {
-        const size_t num_curves = hair->num_curves();
-        for (size_t j = 0; j < num_curves; ++j) {
-          Hair::Curve c = hair->get_curve(j);
-          int fk = c.first_key;
-          int k = 1;
-          for (; k < c.num_keys + 1; ++k, ++fk) {
-            rtc_verts[k] = float3_to_float4(verts[fk]);
-            rtc_verts[k].w = curve_radius[fk];
-          }
-          /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
-          rtc_verts[0] = rtc_verts[1];
-          rtc_verts[k] = rtc_verts[k - 1];
-          rtc_verts += c.num_keys + 2;
-        }
-      }
-      else {
-        for (size_t j = 0; j < num_keys_embree; ++j) {
-          rtc_verts[j] = float3_to_float4(verts[j]);
-          rtc_verts[j].w = curve_radius[j];
+      const size_t num_curves = hair->num_curves();
+      for (size_t j = 0; j < num_curves; ++j) {
+        Hair::Curve c = hair->get_curve(j);
+        int fk = c.first_key;
+        int k = 1;
+        for (; k < c.num_keys + 1; ++k, ++fk) {
+          rtc_verts[k] = float3_to_float4(verts[fk]);
+          rtc_verts[k].w = curve_radius[fk];
         }
+        /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
+        rtc_verts[0] = rtc_verts[1];
+        rtc_verts[k] = rtc_verts[k - 1];
+        rtc_verts += c.num_keys + 2;
       }
     }
   }
-#  if RTC_VERSION >= 30900
-  if (!use_curves) {
-    unsigned char *flags = (unsigned char *)rtcSetNewGeometryBuffer(geom_id,
-                                                                    RTC_BUFFER_TYPE_FLAGS,
-                                                                    0,
-                                                                    RTC_FORMAT_UCHAR,
-                                                                    sizeof(unsigned char),
-                                                                    num_keys_embree);
-    flags[0] = RTC_CURVE_FLAG_NEIGHBOR_RIGHT;
-    ::memset(flags + 1,
-             RTC_CURVE_FLAG_NEIGHBOR_RIGHT | RTC_CURVE_FLAG_NEIGHBOR_RIGHT,
-             num_keys_embree - 2);
-    flags[num_keys_embree - 1] = RTC_CURVE_FLAG_NEIGHBOR_LEFT;
-  }
-#  endif
 }
 
 void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
@@ -800,6 +770,12 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
   }
 
   const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+  const PrimitiveType primitive_type =
+      (num_motion_steps > 1) ?
+          ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+                                                 PRIMITIVE_MOTION_CURVE_THICK) :
+          ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK);
+
   assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
 
   const size_t num_curves = hair->num_curves();
@@ -820,21 +796,12 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
   size_t prim_tri_index_size = pack.prim_index.size();
   pack.prim_tri_index.resize(prim_tri_index_size + num_segments);
 
-#  if RTC_VERSION >= 30900
-  enum RTCGeometryType type = (!use_curves) ?
-                                  (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
-                                                 RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE) :
-                                  (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE :
-                                                 RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE);
-#  else
-  enum RTCGeometryType type = (!use_curves) ?
-                                  RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
-                                  (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE :
-                                                 RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE);
-#  endif
+  enum RTCGeometryType type = (hair->curve_shape == CURVE_RIBBON ?
+                                   RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE :
+                                   RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE);
 
   RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type);
-  rtcSetGeometryTessellationRate(geom_id, curve_subdivisions);
+  rtcSetGeometryTessellationRate(geom_id, curve_subdivisions + 1);
   unsigned *rtc_indices = (unsigned *)rtcSetNewGeometryBuffer(
       geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(int), num_segments);
   size_t rtc_index = 0;
@@ -842,14 +809,11 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
     Hair::Curve c = hair->get_curve(j);
     for (size_t k = 0; k < c.num_segments(); ++k) {
       rtc_indices[rtc_index] = c.first_key + k;
-      if (use_curves) {
-        /* Room for extra CVs at Catmull-Rom splines. */
-        rtc_indices[rtc_index] += j * 2;
-      }
+      /* Room for extra CVs at Catmull-Rom splines. */
+      rtc_indices[rtc_index] += j * 2;
       /* Cycles specific data. */
       pack.prim_object[prim_object_size + rtc_index] = i;
-      pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(
-          num_motion_steps > 1 ? PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k));
+      pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(primitive_type, k));
       pack.prim_index[prim_index_size + rtc_index] = j;
       pack.prim_tri_index[prim_tri_index_size + rtc_index] = rtc_index;
 
@@ -863,8 +827,13 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
   update_curve_vertex_buffer(geom_id, hair);
 
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
-  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
-  rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+  if (hair->curve_shape == CURVE_RIBBON) {
+    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+  }
+  else {
+    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_thick_curve);
+    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_thick_curve);
+  }
   rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
   rtcCommitGeometry(geom_id);
diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h
index eb121d060b7..f60a1ca0102 100644
--- a/intern/cycles/bvh/bvh_embree.h
+++ b/intern/cycles/bvh/bvh_embree.h
@@ -81,7 +81,7 @@ class BVHEmbree : public BVH {
   vector<RTCScene> delayed_delete_scenes;
   int curve_subdivisions;
   enum RTCBuildQuality build_quality;
-  bool use_curves, use_ribbons, dynamic_scene;
+  bool dynamic_scene;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp
index 740994b2ebc..ccb7ae08625 100644
--- a/intern/cycles/bvh/bvh_optix.cpp
+++ b/intern/cycles/bvh/bvh_optix.cpp
@@ -18,10 +18,14 @@
 #ifdef WITH_OPTIX
 
 #  include "bvh/bvh_optix.h"
+
+#  include "device/device.h"
+
 #  include "render/geometry.h"
 #  include "render/hair.h"
 #  include "render/mesh.h"
 #  include "render/object.h"
+
 #  include "util/util_foreach.h"
 #  include "util/util_logging.h"
 #  include "util/util_progress.h"
@@ -73,9 +77,12 @@ void BVHOptiX::pack_blas()
       // 'pack.prim_time' is only used in geom_curve_intersect.h
       // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH
 
-      uint type = PRIMITIVE_CURVE;
-      if (hair->use_motion_blur && hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
-        type = PRIMITIVE_MOTION_CURVE;
+      uint type = (hair->use_motion_blur &&
+                   hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) ?
+                      ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+                                                             PRIMITIVE_MOTION_CURVE_THICK) :
+                      ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON :
+                                                             PRIMITIVE_CURVE_THICK);
 
       for (size_t j = 0; j < num_curves; ++j) {
         const Hair::Curve curve = hair->get_curve(j);
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 5e2c4b63f1b..1a50742dc33 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -89,7 +89,6 @@ class BVHParams {
   int bvh_type;
 
   /* These are needed for Embree. */
-  int curve_flags;
   int curve_subdivisions;
 
   /* fixed parameters */
@@ -122,7 +121,6 @@ class BVHParams {
 
     bvh_type = 0;
 
-    curve_flags = 0;
     curve_subdivisions = 4;
   }
 
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index 4498a759c08..b01785b547a 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -88,18 +88,6 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
                                         const int job_end,
                                         const BVHReferenceCompare &compare);
 
-class BVHSortTask : public Task {
- public:
-  BVHSortTask(TaskPool *task_pool,
-              BVHReference *data,
-              const int job_start,
-              const int job_end,
-              const BVHReferenceCompare &compare)
-  {
-    run = function_bind(bvh_reference_sort_threaded, task_pool, data, job_start, job_end, compare);
-  }
-};
-
 /* Multi-threaded reference sort. */
 static void bvh_reference_sort_threaded(TaskPool *task_pool,
                                         BVHReference *data,
@@ -158,7 +146,8 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
     have_work = false;
     if (left < end) {
       if (start < right) {
-        task_pool->push(new BVHSortTask(task_pool, data, left, end, compare), true);
+        task_pool->push(
+            function_bind(bvh_reference_sort_threaded, task_pool, data, left, end, compare));
       }
       else {
         start = left;
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index acdca0f13ad..4b21f852d7a 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -33,7 +33,7 @@ CCL_NAMESPACE_BEGIN
 BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
                                BVHSpatialStorage *storage,
                                const BVHRange &range,
-                               vector<BVHReference> *references,
+                               vector<BVHReference> &references,
                                float nodeSAH,
                                const BVHUnaligned *unaligned_heuristic,
                                const Transform *aligned_space)
@@ -43,7 +43,7 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
       left_bounds(BoundBox::empty),
       right_bounds(BoundBox::empty),
       storage_(storage),
-      references_(references),
+      references_(&references),
       unaligned_heuristic_(unaligned_heuristic),
       aligned_space_(aligned_space)
 {
@@ -133,7 +133,7 @@ void BVHObjectSplit::split(BVHRange &left, BVHRange &right, const BVHRange &rang
 BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
                                  BVHSpatialStorage *storage,
                                  const BVHRange &range,
-                                 vector<BVHReference> *references,
+                                 vector<BVHReference> &references,
                                  float nodeSAH,
                                  const BVHUnaligned *unaligned_heuristic,
                                  const Transform *aligned_space)
@@ -141,7 +141,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
       dim(0),
       pos(0.0f),
       storage_(storage),
-      references_(references),
+      references_(&references),
       unaligned_heuristic_(unaligned_heuristic),
       aligned_space_(aligned_space)
 {
@@ -152,7 +152,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
   }
   else {
     range_bounds = unaligned_heuristic->compute_aligned_boundbox(
-        range, &references->at(0), *aligned_space);
+        range, &references_->at(0), *aligned_space);
   }
 
   float3 origin = range_bounds.min;
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index 5f2e41cf343..28ff0e05fc3 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -44,7 +44,7 @@ class BVHObjectSplit {
   BVHObjectSplit(BVHBuild *builder,
                  BVHSpatialStorage *storage,
                  const BVHRange &range,
-                 vector<BVHReference> *references,
+                 vector<BVHReference> &references,
                  float nodeSAH,
                  const BVHUnaligned *unaligned_heuristic = NULL,
                  const Transform *aligned_space = NULL);
@@ -82,7 +82,7 @@ class BVHSpatialSplit {
   BVHSpatialSplit(const BVHBuild &builder,
                   BVHSpatialStorage *storage,
                   const BVHRange &range,
-                  vector<BVHReference> *references,
+                  vector<BVHReference> &references,
                   float nodeSAH,
                   const BVHUnaligned *unaligned_heuristic = NULL,
                   const Transform *aligned_space = NULL);
@@ -187,7 +187,7 @@ class BVHMixedSplit {
   __forceinline BVHMixedSplit(BVHBuild *builder,
                               BVHSpatialStorage *storage,
                               const BVHRange &range,
-                              vector<BVHReference> *references,
+                              vector<BVHReference> &references,
                               int level,
                               const BVHUnaligned *unaligned_heuristic = NULL,
                               const Transform *aligned_space = NULL)
@@ -197,7 +197,7 @@ class BVHMixedSplit {
     }
     else {
       bounds = unaligned_heuristic->compute_aligned_boundbox(
-          range, &references->at(0), *aligned_space);
+          range, &references.at(0), *aligned_space);
     }
     /* find split candidates. */
     float area = bounds.safe_area();
@@ -220,7 +220,7 @@ class BVHMixedSplit {
 
     /* leaf SAH is the lowest => create leaf. */
     minSAH = min(min(leafSAH, object.sah), spatial.sah);
-    no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, *references));
+    no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, references));
   }
 
   __forceinline void split(BVHBuild *builder,
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index f0995f343fe..c969b361643 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -68,7 +68,8 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali
   const Object *object = objects_[ref.prim_object()];
   const int packed_type = ref.prim_type();
   const int type = (packed_type & PRIMITIVE_ALL);
-  if (type & PRIMITIVE_CURVE) {
+  /* No motion blur curves here, we can't fit them to aligned boxes well. */
+  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
     const int curve_index = ref.prim_index();
     const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
     const Hair *hair = static_cast<const Hair *>(object->geometry);
@@ -93,7 +94,8 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim,
   const Object *object = objects_[prim.prim_object()];
   const int packed_type = prim.prim_type();
   const int type = (packed_type & PRIMITIVE_ALL);
-  if (type & PRIMITIVE_CURVE) {
+  /* No motion blur curves here, we can't fit them to aligned boxes well. */
+  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
     const int curve_index = prim.prim_index();
     const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
     const Hair *hair = static_cast<const Hair *>(object->geometry);
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index aa5b65a2b73..ca366722eb7 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -99,6 +99,18 @@ if(WITH_CYCLES_DEVICE_MULTI)
   add_definitions(-DWITH_MULTI)
 endif()
 
+if(WITH_OPENIMAGEDENOISE)
+  add_definitions(-DWITH_OPENIMAGEDENOISE)
+  add_definitions(-DOIDN_STATIC_LIB)
+  list(APPEND INC_SYS
+    ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+  )
+  list(APPEND LIB
+    ${OPENIMAGEDENOISE_LIBRARIES}
+    ${TBB_LIBRARIES}
+  )
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 1aa2fdd0967..e5e3e24165d 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -21,6 +21,7 @@
 #  include "device/device_split_kernel.h"
 
 #  include "util/util_map.h"
+#  include "util/util_task.h"
 
 #  ifdef WITH_CUDA_DYNLOAD
 #    include "cuew.h"
@@ -96,9 +97,9 @@ class CUDADevice : public Device {
 
   static bool have_precompiled_kernels();
 
-  virtual bool show_samples() const;
+  virtual bool show_samples() const override;
 
-  virtual BVHLayoutMask get_bvh_layout_mask() const;
+  virtual BVHLayoutMask get_bvh_layout_mask() const override;
 
   void set_error(const string &error) override;
 
@@ -108,7 +109,7 @@ class CUDADevice : public Device {
 
   bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
 
-  bool check_peer_access(Device *peer_device);
+  bool check_peer_access(Device *peer_device) override;
 
   bool use_adaptive_compilation();
 
@@ -122,7 +123,7 @@ class CUDADevice : public Device {
                         const char *base = "cuda",
                         bool force_ptx = false);
 
-  virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+  virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
 
   void load_functions();
 
@@ -140,19 +141,19 @@ class CUDADevice : public Device {
 
   void generic_free(device_memory &mem);
 
-  void mem_alloc(device_memory &mem);
+  void mem_alloc(device_memory &mem) override;
 
-  void mem_copy_to(device_memory &mem);
+  void mem_copy_to(device_memory &mem) override;
 
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
+  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
 
-  void mem_zero(device_memory &mem);
+  void mem_zero(device_memory &mem) override;
 
-  void mem_free(device_memory &mem);
+  void mem_free(device_memory &mem) override;
 
-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/);
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
 
-  virtual void const_copy_to(const char *name, void *host, size_t size);
+  virtual void const_copy_to(const char *name, void *host, size_t size) override;
 
   void global_alloc(device_memory &mem);
 
@@ -252,15 +253,15 @@ class CUDADevice : public Device {
                    int dw,
                    int dh,
                    bool transparent,
-                   const DeviceDrawParams &draw_params);
+                   const DeviceDrawParams &draw_params) override;
 
-  void thread_run(DeviceTask *task);
+  void thread_run(DeviceTask &task);
 
-  virtual void task_add(DeviceTask &task);
+  virtual void task_add(DeviceTask &task) override;
 
-  virtual void task_wait();
+  virtual void task_wait() override;
 
-  virtual void task_cancel();
+  virtual void task_cancel() override;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 7aa63ff48c3..b9bbeb9a25b 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -105,7 +105,7 @@ class CUDASplitKernel : public DeviceSplitKernel {
   virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
                                                          const DeviceRequestedFeatures &);
   virtual int2 split_kernel_local_size();
-  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
 };
 
 /* Utility to push/pop CUDA context. */
@@ -243,7 +243,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
 
 CUDADevice::~CUDADevice()
 {
-  task_pool.stop();
+  task_pool.cancel();
 
   delete split_kernel;
 
@@ -2326,11 +2326,11 @@ void CUDADevice::draw_pixels(device_memory &mem,
   Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
 }
 
-void CUDADevice::thread_run(DeviceTask *task)
+void CUDADevice::thread_run(DeviceTask &task)
 {
   CUDAContextScope scope(this);
 
-  if (task->type == DeviceTask::RENDER) {
+  if (task.type == DeviceTask::RENDER) {
     DeviceRequestedFeatures requested_features;
     if (use_split_kernel()) {
       if (split_kernel == NULL) {
@@ -2343,72 +2343,64 @@ void CUDADevice::thread_run(DeviceTask *task)
 
     /* keep rendering tiles until done */
     RenderTile tile;
-    DenoisingTask denoising(this, *task);
+    DenoisingTask denoising(this, task);
 
-    while (task->acquire_tile(this, tile, task->tile_types)) {
+    while (task.acquire_tile(this, tile, task.tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel()) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
           split_kernel->path_trace(task, tile, void_buffer, void_buffer);
         }
         else {
-          render(*task, tile, work_tiles);
+          render(task, tile, work_tiles);
         }
       }
       else if (tile.task == RenderTile::BAKE) {
-        render(*task, tile, work_tiles);
+        render(task, tile, work_tiles);
       }
       else if (tile.task == RenderTile::DENOISE) {
         tile.sample = tile.start_sample + tile.num_samples;
 
         denoise(tile, denoising);
 
-        task->update_progress(&tile, tile.w * tile.h);
+        task.update_progress(&tile, tile.w * tile.h);
       }
 
-      task->release_tile(tile);
+      task.release_tile(tile);
 
-      if (task->get_cancel()) {
-        if (task->need_finish_queue == false)
+      if (task.get_cancel()) {
+        if (task.need_finish_queue == false)
           break;
       }
     }
 
     work_tiles.free();
   }
-  else if (task->type == DeviceTask::SHADER) {
-    shader(*task);
+  else if (task.type == DeviceTask::SHADER) {
+    shader(task);
 
     cuda_assert(cuCtxSynchronize());
   }
-  else if (task->type == DeviceTask::DENOISE_BUFFER) {
+  else if (task.type == DeviceTask::DENOISE_BUFFER) {
     RenderTile tile;
-    tile.x = task->x;
-    tile.y = task->y;
-    tile.w = task->w;
-    tile.h = task->h;
-    tile.buffer = task->buffer;
-    tile.sample = task->sample + task->num_samples;
-    tile.num_samples = task->num_samples;
-    tile.start_sample = task->sample;
-    tile.offset = task->offset;
-    tile.stride = task->stride;
-    tile.buffers = task->buffers;
-
-    DenoisingTask denoising(this, *task);
+    tile.x = task.x;
+    tile.y = task.y;
+    tile.w = task.w;
+    tile.h = task.h;
+    tile.buffer = task.buffer;
+    tile.sample = task.sample + task.num_samples;
+    tile.num_samples = task.num_samples;
+    tile.start_sample = task.sample;
+    tile.offset = task.offset;
+    tile.stride = task.stride;
+    tile.buffers = task.buffers;
+
+    DenoisingTask denoising(this, task);
     denoise(tile, denoising);
-    task->update_progress(&tile, tile.w * tile.h);
+    task.update_progress(&tile, tile.w * tile.h);
   }
 }
 
-class CUDADeviceTask : public DeviceTask {
- public:
-  CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
-  {
-    run = function_bind(&CUDADevice::thread_run, device, this);
-  }
-};
-
 void CUDADevice::task_add(DeviceTask &task)
 {
   CUDAContextScope scope(this);
@@ -2424,7 +2416,10 @@ void CUDADevice::task_add(DeviceTask &task)
     film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
   }
   else {
-    task_pool.push(new CUDADeviceTask(this, task));
+    task_pool.push([=] {
+      DeviceTask task_copy = task;
+      thread_run(task_copy);
+    });
   }
 }
 
@@ -2652,7 +2647,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
 
 int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
                                                device_memory &data,
-                                               DeviceTask * /*task*/)
+                                               DeviceTask & /*task*/)
 {
   CUDAContextScope scope(device);
   size_t free;
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 41dd7894d93..9dbb33980b4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -77,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques
 
 /* Device */
 
-Device::~Device()
+Device::~Device() noexcept(false)
 {
   if (!background) {
     if (vertex_buffer != 0) {
@@ -603,6 +603,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
   info.has_osl = true;
   info.has_profiling = true;
   info.has_peer_memory = false;
+  info.denoisers = DENOISER_ALL;
 
   foreach (const DeviceInfo &device, subdevices) {
     /* Ensure CPU device does not slow down GPU. */
@@ -647,6 +648,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
     info.has_osl &= device.has_osl;
     info.has_profiling &= device.has_profiling;
     info.has_peer_memory |= device.has_peer_memory;
+    info.denoisers &= device.denoisers;
   }
 
   return info;
@@ -667,4 +669,55 @@ void Device::free_memory()
   network_devices.free_memory();
 }
 
+/* DeviceInfo */
+
+void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+{
+  assert(denoising_devices.empty());
+
+  if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
+    vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
+    if (!optix_devices.empty()) {
+      /* Convert to a special multi device with separate denoising devices. */
+      if (multi_devices.empty()) {
+        multi_devices.push_back(*this);
+      }
+
+      /* Try to use the same physical devices for denoising. */
+      for (const DeviceInfo &cuda_device : multi_devices) {
+        if (cuda_device.type == DEVICE_CUDA) {
+          for (const DeviceInfo &optix_device : optix_devices) {
+            if (cuda_device.num == optix_device.num) {
+              id += optix_device.id;
+              denoising_devices.push_back(optix_device);
+              break;
+            }
+          }
+        }
+      }
+
+      if (denoising_devices.empty()) {
+        /* Simply use the first available OptiX device. */
+        const DeviceInfo optix_device = optix_devices.front();
+        id += optix_device.id; /* Uniquely identify this special multi device. */
+        denoising_devices.push_back(optix_device);
+      }
+
+      denoisers = denoiser_type;
+    }
+  }
+  else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
+    /* Convert to a special multi device with separate denoising devices. */
+    if (multi_devices.empty()) {
+      multi_devices.push_back(*this);
+    }
+
+    /* Add the first available CPU device as the denoising device. */
+    DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
+    denoising_devices.push_back(cpu_device);
+
+    denoisers = denoiser_type;
+  }
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index dff981080a5..a5833369a17 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -83,6 +83,7 @@ class DeviceInfo {
   bool use_split_kernel;             /* Use split or mega kernel. */
   bool has_profiling;                /* Supports runtime collection of profiling info. */
   bool has_peer_memory;              /* GPU has P2P access to memory of another GPU. */
+  DenoiserTypeMask denoisers;        /* Supported denoiser types. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
   vector<DeviceInfo> denoising_devices;
@@ -101,6 +102,7 @@ class DeviceInfo {
     use_split_kernel = false;
     has_profiling = false;
     has_peer_memory = false;
+    denoisers = DENOISER_NONE;
   }
 
   bool operator==(const DeviceInfo &info)
@@ -110,6 +112,9 @@ class DeviceInfo {
            (type == info.type && num == info.num && description == info.description));
     return id == info.id;
   }
+
+  /* Add additional devices needed for the specified denoiser. */
+  void add_denoising_devices(DenoiserType denoiser_type);
 };
 
 class DeviceRequestedFeatures {
@@ -132,6 +137,7 @@ class DeviceRequestedFeatures {
 
   /* BVH/sampling kernel features. */
   bool use_hair;
+  bool use_hair_thick;
   bool use_object_motion;
   bool use_camera_motion;
 
@@ -178,6 +184,7 @@ class DeviceRequestedFeatures {
     max_nodes_group = 0;
     nodes_features = 0;
     use_hair = false;
+    use_hair_thick = false;
     use_object_motion = false;
     use_camera_motion = false;
     use_baking = false;
@@ -200,6 +207,7 @@ class DeviceRequestedFeatures {
              max_nodes_group == requested_features.max_nodes_group &&
              nodes_features == requested_features.nodes_features &&
              use_hair == requested_features.use_hair &&
+             use_hair_thick == requested_features.use_hair_thick &&
              use_object_motion == requested_features.use_object_motion &&
              use_camera_motion == requested_features.use_camera_motion &&
              use_baking == requested_features.use_baking &&
@@ -319,7 +327,8 @@ class Device {
   virtual void mem_free_sub_ptr(device_ptr /*ptr*/){};
 
  public:
-  virtual ~Device();
+  /* noexcept(false) needed to silence TBB warning. */
+  virtual ~Device() noexcept(false);
 
   /* info */
   DeviceInfo info;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fc6febd8cee..8f68e66a1b4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -51,10 +51,12 @@
 #include "util/util_function.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
+#include "util/util_openimagedenoise.h"
 #include "util/util_opengl.h"
 #include "util/util_optimization.h"
 #include "util/util_progress.h"
 #include "util/util_system.h"
+#include "util/util_task.h"
 #include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
@@ -161,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel {
   virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
                                                          const DeviceRequestedFeatures &);
   virtual int2 split_kernel_local_size();
-  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
   virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
 };
 
@@ -176,6 +178,10 @@ class CPUDevice : public Device {
 #ifdef WITH_OSL
   OSLGlobals osl_globals;
 #endif
+#ifdef WITH_OPENIMAGEDENOISE
+  oidn::DeviceRef oidn_device;
+  oidn::FilterRef oidn_filter;
+#endif
 
   bool use_split_kernel;
 
@@ -332,7 +338,7 @@ class CPUDevice : public Device {
 
   ~CPUDevice()
   {
-    task_pool.stop();
+    task_pool.cancel();
     texture_info.free();
   }
 
@@ -344,17 +350,6 @@ class CPUDevice : public Device {
   virtual BVHLayoutMask get_bvh_layout_mask() const
   {
     BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-    if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
-      bvh_layout_mask |= BVH_LAYOUT_BVH4;
-    }
-    /* MSVC does not support the -march=native switch and you always end up  */
-    /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */
-    /* that kernel BVH8 even if the CPU flags would allow for it. */
-#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE))
-    if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
-      bvh_layout_mask |= BVH_LAYOUT_BVH8;
-    }
-#endif
 #ifdef WITH_EMBREE
     bvh_layout_mask |= BVH_LAYOUT_EMBREE;
 #endif /* WITH_EMBREE */
@@ -527,26 +522,18 @@ class CPUDevice : public Device {
 #endif
   }
 
-  void thread_run(DeviceTask *task)
+  void thread_run(DeviceTask &task)
   {
-    if (task->type == DeviceTask::RENDER)
-      thread_render(*task);
-    else if (task->type == DeviceTask::SHADER)
-      thread_shader(*task);
-    else if (task->type == DeviceTask::FILM_CONVERT)
-      thread_film_convert(*task);
-    else if (task->type == DeviceTask::DENOISE_BUFFER)
-      thread_denoise(*task);
+    if (task.type == DeviceTask::RENDER)
+      thread_render(task);
+    else if (task.type == DeviceTask::SHADER)
+      thread_shader(task);
+    else if (task.type == DeviceTask::FILM_CONVERT)
+      thread_film_convert(task);
+    else if (task.type == DeviceTask::DENOISE_BUFFER)
+      thread_denoise(task);
   }
 
-  class CPUDeviceTask : public DeviceTask {
-   public:
-    CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task)
-    {
-      run = function_bind(&CPUDevice::thread_run, device, this);
-    }
-  };
-
   bool denoising_non_local_means(device_ptr image_ptr,
                                  device_ptr guide_ptr,
                                  device_ptr variance_ptr,
@@ -961,7 +948,84 @@ class CPUDevice : public Device {
     }
   }
 
-  void denoise(DenoisingTask &denoising, RenderTile &tile)
+  /* Denoise a single tile with OpenImageDenoise.
+   *
+   * Copies the render buffers of the tile from the device, filters the denoising
+   * color pass with the normal and albedo passes as additional inputs, writes the
+   * result at pass offset 0 and copies the buffers back to the device.
+   * Does nothing when compiled without WITH_OPENIMAGEDENOISE. */
+  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  {
+#ifdef WITH_OPENIMAGEDENOISE
+    assert(openimagedenoise_supported());
+
+    /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+    static thread_mutex mutex;
+    thread_scoped_lock lock(mutex);
+
+    /* Create device and filter, cached for reuse. */
+    if (!oidn_device) {
+      oidn_device = oidn::newDevice();
+      oidn_device.commit();
+    }
+    if (!oidn_filter) {
+      oidn_filter = oidn_device.newFilter("RT");
+    }
+
+    /* Copy pixels from compute device to CPU (no-op for CPU device). */
+    rtile.buffers->buffer.copy_from_device();
+
+    /* Byte strides for our interleaved pass storage, these are the same for every pass. */
+    const int64_t pixel_stride = task.pass_stride * sizeof(float);
+    const int64_t row_stride = rtile.stride * pixel_stride;
+    const int64_t tile_offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
+
+    /* Filter image names and the float offset of each pass within a pixel. */
+    const struct {
+      const char *name;
+      int offset;
+    } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR},
+                  {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL},
+                  {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO},
+                  {"output", 0}};
+
+    for (const auto &pass : passes) {
+      const int64_t buffer_offset = (tile_offset * task.pass_stride + pass.offset) * sizeof(float);
+
+      oidn_filter.setImage(pass.name,
+                           (char *)rtile.buffer + buffer_offset,
+                           oidn::Format::Float3,
+                           rtile.w,
+                           rtile.h,
+                           0,
+                           pixel_stride,
+                           row_stride);
+    }
+
+    /* Execute filter. */
+    oidn_filter.set("hdr", true);
+    oidn_filter.set("srgb", false);
+    oidn_filter.commit();
+    oidn_filter.execute();
+
+    /* OpenImageDenoise reports failures only through the device error state, so
+     * query it here instead of letting a failed filter go unnoticed. */
+    const char *error_message = NULL;
+    if (oidn_device.getError(error_message) != oidn::Error::None) {
+      VLOG(1) << "OpenImageDenoise error: " << (error_message ? error_message : "unknown");
+    }
+
+    /* todo: it may be possible to avoid this copy, but we have to ensure that
+     * when other code copies data from the device it doesn't overwrite the
+     * denoiser buffers. */
+    rtile.buffers->buffer.copy_to_device();
+#else
+    (void)task;
+    (void)rtile;
+#endif
+  }
+
+  void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
   {
     ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
 
@@ -1019,15 +1070,14 @@ class CPUDevice : public Device {
       }
     }
 
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-    denoising.profiler = &kg->profiler;
+    DenoisingTask *denoising = NULL;
 
+    RenderTile tile;
     while (task.acquire_tile(this, tile, task.tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
-          split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
+          split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
         }
         else {
           render(task, tile, kg);
@@ -1037,7 +1087,16 @@ class CPUDevice : public Device {
         render(task, tile, kg);
       }
       else if (tile.task == RenderTile::DENOISE) {
-        denoise(denoising, tile);
+        if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+          denoise_openimagedenoise(task, tile);
+        }
+        else if (task.denoising.type == DENOISER_NLM) {
+          if (denoising == NULL) {
+            denoising = new DenoisingTask(this, task);
+            denoising->profiler = &kg->profiler;
+          }
+          denoise_nlm(*denoising, tile);
+        }
         task.update_progress(&tile, tile.w * tile.h);
       }
 
@@ -1055,6 +1114,7 @@ class CPUDevice : public Device {
     kg->~KernelGlobals();
     kgbuffer.free();
     delete split_kernel;
+    delete denoising;
   }
 
   void thread_denoise(DeviceTask &task)
@@ -1072,16 +1132,22 @@ class CPUDevice : public Device {
     tile.stride = task.stride;
     tile.buffers = task.buffers;
 
-    DenoisingTask denoising(this, task);
+    if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      denoise_openimagedenoise(task, tile);
+    }
+    else {
+      DenoisingTask denoising(this, task);
 
-    ProfilingState denoising_profiler_state;
-    profiler.add_state(&denoising_profiler_state);
-    denoising.profiler = &denoising_profiler_state;
+      ProfilingState denoising_profiler_state;
+      profiler.add_state(&denoising_profiler_state);
+      denoising.profiler = &denoising_profiler_state;
 
-    denoise(denoising, tile);
-    task.update_progress(&tile, tile.w * tile.h);
+      denoise_nlm(denoising, tile);
+
+      profiler.remove_state(&denoising_profiler_state);
+    }
 
-    profiler.remove_state(&denoising_profiler_state);
+    task.update_progress(&tile, tile.w * tile.h);
   }
 
   void thread_film_convert(DeviceTask &task)
@@ -1155,13 +1221,24 @@ class CPUDevice : public Device {
     /* split task into smaller ones */
     list<DeviceTask> tasks;
 
-    if (task.type == DeviceTask::SHADER)
+    if (task.type == DeviceTask::DENOISE_BUFFER &&
+        task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      /* Denoise entire buffer at once with OIDN, it has own threading. */
+      tasks.push_back(task);
+    }
+    else if (task.type == DeviceTask::SHADER) {
       task.split(tasks, info.cpu_threads, 256);
-    else
+    }
+    else {
       task.split(tasks, info.cpu_threads);
+    }
 
-    foreach (DeviceTask &task, tasks)
-      task_pool.push(new CPUDeviceTask(this, task));
+    foreach (DeviceTask &task, tasks) {
+      task_pool.push([=] {
+        DeviceTask task_copy = task;
+        thread_run(task_copy);
+      });
+    }
   }
 
   void task_wait()
@@ -1326,7 +1403,7 @@ int2 CPUSplitKernel::split_kernel_local_size()
 
 int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
                                               device_memory & /*data*/,
-                                              DeviceTask * /*task*/)
+                                              DeviceTask & /*task*/)
 {
   return make_int2(1, 1);
 }
@@ -1358,6 +1435,10 @@ void device_cpu_info(vector<DeviceInfo> &devices)
   info.has_osl = true;
   info.has_half_images = true;
   info.has_profiling = true;
+  info.denoisers = DENOISER_NLM;
+  if (openimagedenoise_supported()) {
+    info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+  }
 
   devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 04c04761311..d9ffcceb06e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -130,6 +130,7 @@ void device_cuda_info(vector<DeviceInfo> &devices)
     info.has_half_images = (major >= 3);
     info.has_volume_decoupled = false;
     info.has_adaptive_stop_per_sample = false;
+    info.denoisers = DENOISER_NLM;
 
     /* Check if the device has P2P access to any other device in the system. */
     for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index ac17c02a427..89de80a5bcd 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -56,8 +56,8 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
     tile_info->frames[i] = task.denoising_frames[i - 1];
   }
 
-  write_passes = task.denoising_write_passes;
-  do_filter = task.denoising_do_filter;
+  do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
+  do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
 }
 
 DenoisingTask::~DenoisingTask()
@@ -91,7 +91,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
   target_buffer.stride = rtiles[9].stride;
   target_buffer.ptr = rtiles[9].buffer;
 
-  if (write_passes && rtiles[9].buffers) {
+  if (do_prefilter && rtiles[9].buffers) {
     target_buffer.denoising_output_offset =
         rtiles[9].buffers->params.get_denoising_prefiltered_offset();
   }
@@ -111,7 +111,7 @@ void DenoisingTask::setup_denoising_buffer()
   rect = rect_clip(rect,
                    make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
 
-  buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
+  buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
   buffer.passes = buffer.use_intensity ? 15 : 14;
   buffer.width = rect.z - rect.x;
   buffer.stride = align_up(buffer.width, 4);
@@ -343,7 +343,7 @@ void DenoisingTask::run_denoising(RenderTile *tile)
     reconstruct();
   }
 
-  if (write_passes) {
+  if (do_prefilter) {
     write_buffer();
   }
 
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index bd1d0193dbd..4c122e981eb 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -60,7 +60,7 @@ class DenoisingTask {
   int4 rect;
   int4 filter_area;
 
-  bool write_passes;
+  bool do_prefilter;
   bool do_filter;
 
   struct DeviceFunctions {
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 020b9e10e60..fd14bbdccc5 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -396,8 +396,8 @@ class MultiDevice : public Device {
     size_t existing_size = mem.device_size;
 
     /* This is a hack to only allocate the tile buffers on denoising devices
-     * Similarily the tile buffers also need to be allocated separately on all devices so any
-     * overlap rendered for denoising does not interfer with each other */
+     * Similarly the tile buffers also need to be allocated separately on all devices so any
+     * overlap rendered for denoising does not interfere with each other */
     if (strcmp(mem.name, "RenderBuffers") == 0) {
       vector<device_ptr> device_pointers;
       device_pointers.reserve(devices.size());
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 0933d51f321..8904b517e92 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -313,6 +313,7 @@ void device_network_info(vector<DeviceInfo> &devices)
   info.has_volume_decoupled = false;
   info.has_adaptive_stop_per_sample = false;
   info.has_osl = false;
+  info.denoisers = DENOISER_NONE;
 
   devices.push_back(info);
 }
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 8a0b128697f..39b9ef70192 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -120,6 +120,7 @@ void device_opencl_info(vector<DeviceInfo> &devices)
     info.use_split_kernel = true;
     info.has_volume_decoupled = false;
     info.has_adaptive_stop_per_sample = false;
+    info.denoisers = DENOISER_NLM;
     info.id = id;
 
     /* Check OpenCL extensions */
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index fbf6a914744..ececca3df53 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -246,7 +246,7 @@ class OptiXDevice : public CUDADevice {
   ~OptiXDevice()
   {
     // Stop processing any more tasks
-    task_pool.stop();
+    task_pool.cancel();
 
     // Make CUDA context current
     const CUDAContextScope scope(cuContext);
@@ -428,11 +428,20 @@ class OptiXDevice : public CUDADevice {
     group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
 
     if (requested_features.use_hair) {
-      // Add curve intersection programs
       group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
-      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve";
       group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve";
+
+      // Add curve intersection programs
+      if (requested_features.use_hair_thick) {
+        // Use the slower "all" programs only when thick hair is requested, since they
+        // also slow down ribbons. Ideally this should not be needed.
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+      }
+      else {
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+      }
     }
 
     if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@@ -712,7 +721,7 @@ class OptiXDevice : public CUDADevice {
     const CUDAContextScope scope(cuContext);
 
     // Choose between OptiX and NLM denoising
-    if (task.denoising_use_optix) {
+    if (task.denoising.type == DENOISER_OPTIX) {
       // Map neighboring tiles onto this device, indices are as following:
       // Where index 4 is the center tile and index 9 is the target for the result.
       //   0 1 2
@@ -1436,21 +1445,21 @@ class OptiXDevice : public CUDADevice {
       KernelData *const data = (KernelData *)host;
       *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
 
-      update_launch_params(name, offsetof(KernelParams, data), host, size);
+      update_launch_params(offsetof(KernelParams, data), host, size);
       return;
     }
 
     // Update data storage pointers in launch parameters
 #  define KERNEL_TEX(data_type, tex_name) \
     if (strcmp(name, #tex_name) == 0) { \
-      update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \
+      update_launch_params(offsetof(KernelParams, tex_name), host, size); \
       return; \
     }
 #  include "kernel/kernel_textures.h"
 #  undef KERNEL_TEX
   }
 
-  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
+  void update_launch_params(size_t offset, void *data, size_t data_size)
   {
     const CUDAContextScope scope(cuContext);
 
@@ -1463,15 +1472,6 @@ class OptiXDevice : public CUDADevice {
 
   void task_add(DeviceTask &task) override
   {
-    struct OptiXDeviceTask : public DeviceTask {
-      OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
-      {
-        // Using task index parameter instead of thread index, since number of CUDA streams may
-        // differ from number of threads
-        run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
-      }
-    };
-
     // Upload texture information to device if it has changed since last launch
     load_texture_info();
 
@@ -1483,7 +1483,10 @@ class OptiXDevice : public CUDADevice {
 
     if (task.type == DeviceTask::DENOISE_BUFFER) {
       // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
-      task_pool.push(new OptiXDeviceTask(this, task, 0));
+      task_pool.push([=] {
+        DeviceTask task_copy = task;
+        thread_run(task_copy, 0);
+      });
       return;
     }
 
@@ -1493,8 +1496,15 @@ class OptiXDevice : public CUDADevice {
 
     // Queue tasks in internal task pool
     int task_index = 0;
-    for (DeviceTask &task : tasks)
-      task_pool.push(new OptiXDeviceTask(this, task, task_index++));
+    for (DeviceTask &task : tasks) {
+      task_pool.push([=] {
+        // Using task index parameter instead of thread index, since number of CUDA streams may
+        // differ from number of threads
+        DeviceTask task_copy = task;
+        thread_run(task_copy, task_index);
+      });
+      task_index++;
+    }
   }
 
   void task_wait() override
@@ -1551,6 +1561,7 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
 
     info.type = DEVICE_OPTIX;
     info.id += "_OptiX";
+    info.denoisers |= DENOISER_OPTIX;
 
     devices.push_back(info);
   }
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index f22d8761058..4c288f60c16 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -145,7 +145,7 @@ size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
   return max_buffer_size / size_per_element;
 }
 
-bool DeviceSplitKernel::path_trace(DeviceTask *task,
+bool DeviceSplitKernel::path_trace(DeviceTask &task,
                                    RenderTile &tile,
                                    device_memory &kgbuffer,
                                    device_memory &kernel_data)
@@ -222,9 +222,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
     subtile.start_sample = tile.sample;
     subtile.num_samples = samples_per_second;
 
-    if (task->adaptive_sampling.use) {
-      subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample,
-                                                                          subtile.num_samples);
+    if (task.adaptive_sampling.use) {
+      subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+                                                                         subtile.num_samples);
     }
 
     /* Don't go beyond requested number of samples. */
@@ -286,7 +286,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
         ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
         ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
 
-        if (task->get_cancel() && cancel_time == DBL_MAX) {
+        if (task.get_cancel() && cancel_time == DBL_MAX) {
           /* Wait up to twice as many seconds for current samples to finish
            * to avoid artifacts in render result from ending too soon.
            */
@@ -323,7 +323,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
     }
 
     int filter_sample = tile.sample + subtile.num_samples - 1;
-    if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) {
+    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
       size_t buffer_size[2];
       buffer_size[0] = round_up(tile.w, local_size[0]);
       buffer_size[1] = round_up(tile.h, local_size[1]);
@@ -352,16 +352,16 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 #undef ENQUEUE_SPLIT_KERNEL
 
     tile.sample += subtile.num_samples;
-    task->update_progress(&tile, tile.w * tile.h * subtile.num_samples);
+    task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
 
     time_multiplier = min(time_multiplier << 1, 10);
 
-    if (task->get_cancel()) {
+    if (task.get_cancel()) {
       return true;
     }
   }
 
-  if (task->adaptive_sampling.use) {
+  if (task.adaptive_sampling.use) {
     /* Reset the start samples. */
     RenderTile subtile = tile;
     subtile.start_sample = tile.start_sample;
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 9d6b9efdd62..07a21b10299 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -109,7 +109,7 @@ class DeviceSplitKernel {
   virtual ~DeviceSplitKernel();
 
   bool load_kernels(const DeviceRequestedFeatures &requested_features);
-  bool path_trace(DeviceTask *task,
+  bool path_trace(DeviceTask &task,
                   RenderTile &rtile,
                   device_memory &kgbuffer,
                   device_memory &kernel_data);
@@ -137,7 +137,7 @@ class DeviceSplitKernel {
   virtual int2 split_kernel_local_size() = 0;
   virtual int2 split_kernel_global_size(device_memory &kg,
                                         device_memory &data,
-                                        DeviceTask *task) = 0;
+                                        DeviceTask &task) = 0;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 7485e1b41de..6e7c184c6c9 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -50,7 +50,7 @@ DeviceTask::DeviceTask(Type type_)
   last_update_time = time_dt();
 }
 
-int DeviceTask::get_subtask_count(int num, int max_size)
+int DeviceTask::get_subtask_count(int num, int max_size) const
 {
   if (max_size != 0) {
     int max_size_num;
@@ -78,7 +78,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
   return num;
 }
 
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
+void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
 {
   num = get_subtask_count(num, max_size);
 
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8c4e682adb1..600973b8100 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -21,7 +21,6 @@
 
 #include "util/util_function.h"
 #include "util/util_list.h"
-#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -32,8 +31,33 @@ class RenderBuffers;
 class RenderTile;
 class Tile;
 
+enum DenoiserType {
+  DENOISER_NLM = 1, /* Distinct bits, so types can be OR-ed into a DenoiserTypeMask. */
+  DENOISER_OPTIX = 2,
+  DENOISER_OPENIMAGEDENOISE = 4,
+  DENOISER_NUM, /* NOTE(review): implicitly 5, neither a count nor a flag bit - confirm. */
+
+  DENOISER_NONE = 0, /* No denoiser, the empty mask. */
+  DENOISER_ALL = ~0, /* Every bit set. */
+};
+
+typedef int DenoiserTypeMask; /* Bitwise OR of DenoiserType values. */
+
 class DenoiseParams {
  public:
+  /* Apply denoiser to image. */
+  bool use;
+  /* Output denoising data passes (possibly without applying the denoiser). */
+  bool store_passes;
+
+  /* Denoiser type. */
+  DenoiserType type;
+
+  /* Viewport start sample. */
+  int start_sample;
+
+  /** Native Denoiser **/
+
   /* Pixel radius for neighboring pixels to take into account. */
   int radius;
   /* Controls neighbor pixel weighting for the denoising filter. */
@@ -47,18 +71,36 @@ class DenoiseParams {
   int neighbor_frames;
   /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
   bool clamp_input;
+
+  /** OptiX Denoiser **/
+
   /* Passes handed over to the OptiX denoiser (default to color + albedo). */
   int optix_input_passes;
 
   DenoiseParams()
   {
+    use = false;
+    store_passes = false;
+
+    type = DENOISER_NLM;
+
     radius = 8;
     strength = 0.5f;
     feature_strength = 0.5f;
     relative_pca = false;
     neighbor_frames = 2;
     clamp_input = true;
+
     optix_input_passes = 2;
+
+    start_sample = 0;
+  }
+
+  /* Whether a denoising task has to run: either the denoiser is applied to the image,
+   * or the native (NLM) denoiser must prefilter passes that are only being stored. */
+  bool need_denoising_task() const
+  {
+    return use || (type == DENOISER_NLM && store_passes);
+  }
 };
 
@@ -75,7 +117,7 @@ class AdaptiveSampling {
   int min_samples;
 };
 
-class DeviceTask : public Task {
+class DeviceTask {
  public:
   typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
   Type type;
@@ -98,8 +140,8 @@ class DeviceTask : public Task {
 
   explicit DeviceTask(Type type = RENDER);
 
-  int get_subtask_count(int num, int max_size = 0);
-  void split(list<DeviceTask> &tasks, int num, int max_size = 0);
+  int get_subtask_count(int num, int max_size = 0) const;
+  void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
 
   void update_progress(RenderTile *rtile, int pixel_samples = -1);
 
@@ -116,10 +158,6 @@ class DeviceTask : public Task {
   bool denoising_from_render;
   vector<int> denoising_frames;
 
-  bool denoising_do_filter;
-  bool denoising_use_optix;
-  bool denoising_write_passes;
-
   int pass_stride;
   int frame_stride;
   int target_pass_stride;
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 389268e1c2a..e0140996cf0 100644
--- a/intern/cycles/device/opencl/device_opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -23,6 +23,7 @@
 #  include "util/util_map.h"
 #  include "util/util_param.h"
 #  include "util/util_string.h"
+#  include "util/util_task.h"
 
 #  include "clew.h"
 
@@ -258,6 +259,8 @@ class OpenCLDevice : public Device {
   TaskPool load_required_kernel_task_pool;
   /* Task pool for optional kernels (feature kernels during foreground rendering) */
   TaskPool load_kernel_task_pool;
+  std::atomic<int> load_kernel_num_compiling;
+
   cl_context cxContext;
   cl_command_queue cqCommandQueue;
   cl_platform_id cpPlatform;
@@ -455,14 +458,6 @@ class OpenCLDevice : public Device {
 
   void denoise(RenderTile &tile, DenoisingTask &denoising);
 
-  class OpenCLDeviceTask : public DeviceTask {
-   public:
-    OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task)
-    {
-      run = function_bind(&OpenCLDevice::thread_run, device, this);
-    }
-  };
-
   int get_split_task_count(DeviceTask & /*task*/)
   {
     return 1;
@@ -470,7 +465,10 @@ class OpenCLDevice : public Device {
 
   void task_add(DeviceTask &task)
   {
-    task_pool.push(new OpenCLDeviceTask(this, task));
+    task_pool.push([=] {
+      DeviceTask task_copy = task; /* Capture is a const copy; thread_run() needs non-const. */
+      thread_run(task_copy);
+    });
   }
 
   void task_wait()
@@ -483,7 +481,7 @@ class OpenCLDevice : public Device {
     task_pool.cancel();
   }
 
-  void thread_run(DeviceTask *task);
+  void thread_run(DeviceTask &task);
 
   virtual BVHLayoutMask get_bvh_layout_mask() const
   {
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index beb3174b111..8c94815b193 100644
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -542,7 +542,7 @@ class OpenCLSplitKernel : public DeviceSplitKernel {
 
   virtual int2 split_kernel_global_size(device_memory &kg,
                                         device_memory &data,
-                                        DeviceTask * /*task*/)
+                                        DeviceTask & /*task*/)
   {
     cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
     /* Use small global size on CPU devices as it seems to be much faster. */
@@ -610,6 +610,7 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
 
 OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
     : Device(info, stats, profiler, background),
+      load_kernel_num_compiling(0),
       kernel_programs(this),
       preview_programs(this),
       memory_manager(this),
@@ -684,9 +685,9 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b
 
 OpenCLDevice::~OpenCLDevice()
 {
-  task_pool.stop();
-  load_required_kernel_task_pool.stop();
-  load_kernel_task_pool.stop();
+  task_pool.cancel();
+  load_required_kernel_task_pool.cancel();
+  load_kernel_task_pool.cancel();
 
   memory_manager.free();
 
@@ -798,7 +799,11 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_feature
    * internally within a single process. */
   foreach (OpenCLProgram *program, programs) {
     if (!program->load()) {
-      load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+      load_kernel_num_compiling++; /* Raised before the compile task is queued. */
+      load_kernel_task_pool.push([=] {
+        program->compile();
+        load_kernel_num_compiling--; /* Lowered once compile() has returned; polled via load(). */
+      });
     }
   }
   return true;
@@ -868,7 +873,7 @@ bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requeste
      * Better to check on device level than per kernel as mixing preview and
      * non-preview kernels does not work due to different data types */
     if (use_preview_kernels) {
-      use_preview_kernels = !load_kernel_task_pool.finished();
+      use_preview_kernels = load_kernel_num_compiling.load() > 0;
     }
   }
   return split_kernel->load_kernels(requested_features);
@@ -895,7 +900,7 @@ DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
     return DEVICE_KERNEL_USING_FEATURE_KERNEL;
   }
 
-  bool other_kernels_finished = load_kernel_task_pool.finished();
+  bool other_kernels_finished = load_kernel_num_compiling.load() == 0;
   if (use_preview_kernels) {
     if (other_kernels_finished) {
       return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
@@ -1336,20 +1341,20 @@ void OpenCLDevice::flush_texture_buffers()
   memory_manager.alloc("texture_info", texture_info);
 }
 
-void OpenCLDevice::thread_run(DeviceTask *task)
+void OpenCLDevice::thread_run(DeviceTask &task)
 {
   flush_texture_buffers();
 
-  if (task->type == DeviceTask::RENDER) {
+  if (task.type == DeviceTask::RENDER) {
     RenderTile tile;
-    DenoisingTask denoising(this, *task);
+    DenoisingTask denoising(this, task);
 
     /* Allocate buffer for kernel globals */
     device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
     kgbuffer.alloc_to_device(1);
 
     /* Keep rendering tiles until done. */
-    while (task->acquire_tile(this, tile, task->tile_types)) {
+    while (task.acquire_tile(this, tile, task.tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         assert(tile.task == RenderTile::PATH_TRACE);
         scoped_timer timer(&tile.buffers->render_time);
@@ -1368,42 +1373,42 @@ void OpenCLDevice::thread_run(DeviceTask *task)
         clFinish(cqCommandQueue);
       }
       else if (tile.task == RenderTile::BAKE) {
-        bake(*task, tile);
+        bake(task, tile);
       }
       else if (tile.task == RenderTile::DENOISE) {
         tile.sample = tile.start_sample + tile.num_samples;
         denoise(tile, denoising);
-        task->update_progress(&tile, tile.w * tile.h);
+        task.update_progress(&tile, tile.w * tile.h);
       }
 
-      task->release_tile(tile);
+      task.release_tile(tile);
     }
 
     kgbuffer.free();
   }
-  else if (task->type == DeviceTask::SHADER) {
-    shader(*task);
+  else if (task.type == DeviceTask::SHADER) {
+    shader(task);
   }
-  else if (task->type == DeviceTask::FILM_CONVERT) {
-    film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+  else if (task.type == DeviceTask::FILM_CONVERT) {
+    film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
   }
-  else if (task->type == DeviceTask::DENOISE_BUFFER) {
+  else if (task.type == DeviceTask::DENOISE_BUFFER) {
     RenderTile tile;
-    tile.x = task->x;
-    tile.y = task->y;
-    tile.w = task->w;
-    tile.h = task->h;
-    tile.buffer = task->buffer;
-    tile.sample = task->sample + task->num_samples;
-    tile.num_samples = task->num_samples;
-    tile.start_sample = task->sample;
-    tile.offset = task->offset;
-    tile.stride = task->stride;
-    tile.buffers = task->buffers;
-
-    DenoisingTask denoising(this, *task);
+    tile.x = task.x;
+    tile.y = task.y;
+    tile.w = task.w;
+    tile.h = task.h;
+    tile.buffer = task.buffer;
+    tile.sample = task.sample + task.num_samples;
+    tile.num_samples = task.num_samples;
+    tile.start_sample = task.sample;
+    tile.offset = task.offset;
+    tile.stride = task.stride;
+    tile.buffers = task.buffers;
+
+    DenoisingTask denoising(this, task);
     denoise(tile, denoising);
-    task->update_progress(&tile, tile.w * tile.h);
+    task.update_progress(&tile, tile.w * tile.h);
   }
 }
 
@@ -1937,10 +1942,8 @@ void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
   clFinish(cqCommandQueue);
 }
 
-string OpenCLDevice::kernel_build_options(const string *debug_src)
+static bool kernel_build_opencl_2(cl_device_id cdDevice)
 {
-  string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
   /* Build with OpenCL 2.0 if available, this improves performance
    * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
    * Note that OpenCL selects the highest 1.x version by default,
@@ -1948,10 +1951,36 @@ string OpenCLDevice::kernel_build_options(const string *debug_src)
   int version_major, version_minor;
   if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
     if (version_major >= 2) {
-      build_options += "-cl-std=CL2.0 ";
+      /* This appears to trigger a driver bug in Radeon RX cards with certain
+       * driver versions, so don't use OpenCL 2.0 for those. */
+      string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
+      if (string_startswith(device_name, "Radeon RX 4") ||
+          string_startswith(device_name, "Radeon (TM) RX 4") ||
+          string_startswith(device_name, "Radeon RX 5") ||
+          string_startswith(device_name, "Radeon (TM) RX 5")) {
+        char version[256] = "";
+        int driver_major, driver_minor;
+        clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+        if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
+          return !(driver_major == 3075 && driver_minor <= 12);
+        }
+      }
+
+      return true;
     }
   }
 
+  return false;
+}
+
+string OpenCLDevice::kernel_build_options(const string *debug_src)
+{
+  string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
+
+  if (kernel_build_opencl_2(cdDevice)) {
+    build_options += "-cl-std=CL2.0 ";
+  }
+
   if (platform_name == "NVIDIA CUDA") {
     build_options +=
         "-D__KERNEL_OPENCL_NVIDIA__ "
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 2e839a616e9..7cc0d32d521 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -81,18 +81,6 @@ set(SRC_BVH_HEADERS
   bvh/bvh_types.h
   bvh/bvh_volume.h
   bvh/bvh_volume_all.h
-  bvh/qbvh_nodes.h
-  bvh/qbvh_shadow_all.h
-  bvh/qbvh_local.h
-  bvh/qbvh_traversal.h
-  bvh/qbvh_volume.h
-  bvh/qbvh_volume_all.h
-  bvh/obvh_nodes.h
-  bvh/obvh_shadow_all.h
-  bvh/obvh_local.h
-  bvh/obvh_traversal.h
-  bvh/obvh_volume.h
-  bvh/obvh_volume_all.h
   bvh/bvh_embree.h
 )
 
@@ -113,6 +101,8 @@ set(SRC_HEADERS
   kernel_id_passes.h
   kernel_jitter.h
   kernel_light.h
+  kernel_light_background.h
+  kernel_light_common.h
   kernel_math.h
   kernel_montecarlo.h
   kernel_passes.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 9b9df883b62..80b58f46329 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -35,14 +35,6 @@ CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPTIX__
 
-/* Common QBVH functions. */
-#  ifdef __QBVH__
-#    include "kernel/bvh/qbvh_nodes.h"
-#    ifdef __KERNEL_AVX2__
-#      include "kernel/bvh/obvh_nodes.h"
-#    endif
-#  endif
-
 /* Regular BVH traversal */
 
 #  include "kernel/bvh/bvh_nodes.h"
@@ -51,27 +43,21 @@ CCL_NAMESPACE_BEGIN
 #  define BVH_FUNCTION_FEATURES 0
 #  include "kernel/bvh/bvh_traversal.h"
 
-#  if defined(__INSTANCING__)
-#    define BVH_FUNCTION_NAME bvh_intersect_instancing
-#    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "kernel/bvh/bvh_traversal.h"
-#  endif
-
 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_hair
-#    define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
 #    include "kernel/bvh/bvh_traversal.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_motion
-#    define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION
+#    define BVH_FUNCTION_FEATURES BVH_MOTION
 #    include "kernel/bvh/bvh_traversal.h"
 #  endif
 
 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-#    define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION
+#    define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
 #    include "kernel/bvh/bvh_traversal.h"
 #  endif
 
@@ -96,15 +82,9 @@ CCL_NAMESPACE_BEGIN
 #    define BVH_FUNCTION_FEATURES BVH_HAIR
 #    include "kernel/bvh/bvh_volume.h"
 
-#    if defined(__INSTANCING__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
-#      include "kernel/bvh/bvh_volume.h"
-#    endif
-
 #    if defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
 #      include "kernel/bvh/bvh_volume.h"
 #    endif
 #  endif /* __VOLUME__ */
@@ -116,27 +96,21 @@ CCL_NAMESPACE_BEGIN
 #    define BVH_FUNCTION_FEATURES 0
 #    include "kernel/bvh/bvh_shadow_all.h"
 
-#    if defined(__INSTANCING__)
-#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#      include "kernel/bvh/bvh_shadow_all.h"
-#    endif
-
 #    if defined(__HAIR__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
+#      define BVH_FUNCTION_FEATURES BVH_HAIR
 #      include "kernel/bvh/bvh_shadow_all.h"
 #    endif
 
 #    if defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION
+#      define BVH_FUNCTION_FEATURES BVH_MOTION
 #      include "kernel/bvh/bvh_shadow_all.h"
 #    endif
 
 #    if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION
+#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
 #      include "kernel/bvh/bvh_shadow_all.h"
 #    endif
 #  endif /* __SHADOW_RECORD_ALL__ */
@@ -148,15 +122,9 @@ CCL_NAMESPACE_BEGIN
 #    define BVH_FUNCTION_FEATURES BVH_HAIR
 #    include "kernel/bvh/bvh_volume_all.h"
 
-#    if defined(__INSTANCING__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
-#      include "kernel/bvh/bvh_volume_all.h"
-#    endif
-
 #    if defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
 #      include "kernel/bvh/bvh_volume_all.h"
 #    endif
 #  endif /* __VOLUME_RECORD_ALL__ */
@@ -264,21 +232,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
   }
 #  endif /* __HAIR__ */
 
-#  ifdef __KERNEL_CPU__
-#    ifdef __INSTANCING__
-  if (kernel_data.bvh.have_instancing) {
-    return bvh_intersect_instancing(kg, ray, isect, visibility);
-  }
-#    endif /* __INSTANCING__ */
-  return bvh_intersect(kg, ray, isect, visibility);
-#  else    /* __KERNEL_CPU__ */
-#    ifdef __INSTANCING__
-  return bvh_intersect_instancing(kg, ray, isect, visibility);
-#    else
   return bvh_intersect(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
-#  endif   /* __KERNEL_CPU__ */
-#endif     /* __KERNEL_OPTIX__ */
+#endif   /* __KERNEL_OPTIX__ */
 }
 
 #ifdef __BVH_LOCAL__
@@ -476,21 +431,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
   }
 #    endif /* __HAIR__ */
 
-#    ifdef __KERNEL_CPU__
-#      ifdef __INSTANCING__
-  if (kernel_data.bvh.have_instancing) {
-    return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits);
-  }
-#      endif /* __INSTANCING__ */
   return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
-#    else
-#      ifdef __INSTANCING__
-  return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits);
-#      else
-  return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
-#      endif /* __INSTANCING__ */
-#    endif   /* __KERNEL_CPU__ */
-#  endif     /* __KERNEL_OPTIX__ */
+#  endif   /* __KERNEL_OPTIX__ */
 }
 #endif /* __SHADOW_RECORD_ALL__ */
 
@@ -548,21 +490,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
   }
 #    endif /* __OBJECT_MOTION__ */
 
-#    ifdef __KERNEL_CPU__
-#      ifdef __INSTANCING__
-  if (kernel_data.bvh.have_instancing) {
-    return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-  }
-#      endif /* __INSTANCING__ */
   return bvh_intersect_volume(kg, ray, isect, visibility);
-#    else    /* __KERNEL_CPU__ */
-#      ifdef __INSTANCING__
-  return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-#      else
-  return bvh_intersect_volume(kg, ray, isect, visibility);
-#      endif /* __INSTANCING__ */
-#    endif   /* __KERNEL_CPU__ */
-#  endif     /* __KERNEL_OPTIX__ */
+#  endif   /* __KERNEL_OPTIX__ */
 }
 #endif /* __VOLUME__ */
 
@@ -599,11 +528,6 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
   }
 #  endif /* __OBJECT_MOTION__ */
 
-#  ifdef __INSTANCING__
-  if (kernel_data.bvh.have_instancing) {
-    return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
-  }
-#  endif /* __INSTANCING__ */
   return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
 }
 #endif /* __VOLUME_RECORD_ALL__ */
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 7a069ef1108..4006c9c1632 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -17,13 +17,6 @@
  * limitations under the License.
  */
 
-#ifdef __QBVH__
-#  include "kernel/bvh/qbvh_local.h"
-#  ifdef __KERNEL_AVX2__
-#    include "kernel/bvh/obvh_local.h"
-#  endif
-#endif
-
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
 #else
@@ -88,26 +81,6 @@ ccl_device_inline
     object = local_object;
   }
 
-#if defined(__KERNEL_SSE2__)
-  const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-  const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-  ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-  ssef tnear(0.0f), tfar(isect_t);
-#  endif
-  shuffle_swap_t shufflexyz[3];
-
-  Psplat[0] = ssef(P.x);
-  Psplat[1] = ssef(P.y);
-  Psplat[2] = ssef(P.z);
-
-  ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-  gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
   /* traversal loop */
   do {
     do {
@@ -117,33 +90,16 @@ ccl_device_inline
         float dist[2];
         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
 
-#if !defined(__KERNEL_SSE2__)
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
-#  if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
                                        dir,
-#  endif
+#endif
                                        idir,
                                        isect_t,
                                        node_addr,
                                        PATH_RAY_ALL_VISIBILITY,
                                        dist);
-#else  // __KERNEL_SSE2__
-        traverse_mask = NODE_INTERSECT(kg,
-                                       P,
-                                       dir,
-#  if BVH_FEATURE(BVH_HAIR)
-                                       tnear,
-                                       tfar,
-#  endif
-                                       tsplat,
-                                       Psplat,
-                                       idirsplat,
-                                       shufflexyz,
-                                       node_addr,
-                                       PATH_RAY_ALL_VISIBILITY,
-                                       dist);
-#endif  // __KERNEL_SSE2__
 
         node_addr = __float_as_int(cnodes.z);
         node_addr_child1 = __float_as_int(cnodes.w);
@@ -247,20 +203,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          uint *lcg_state,
                                          int max_hits)
 {
-  switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
-    case BVH_LAYOUT_BVH8:
-      return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#endif
-#ifdef __QBVH__
-    case BVH_LAYOUT_BVH4:
-      return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#endif
-    case BVH_LAYOUT_BVH2:
-      return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
-  }
-  kernel_assert(!"Should not happen");
-  return false;
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index db598d1c7fa..5367bdb633c 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -28,7 +28,6 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
   return space;
 }
 
-#if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                       const float3 P,
                                                       const float3 idir,
@@ -39,9 +38,9 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 {
 
   /* fetch node data */
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
   float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#  endif
+#endif
   float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
   float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
   float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
@@ -68,13 +67,13 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
   dist[0] = c0min;
   dist[1] = c1min;
 
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
   /* this visibility test gives a 5% performance hit, how to solve? */
   return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
          (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
-#  else
+#else
   return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
-#  endif
+#endif
 }
 
 ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
@@ -113,21 +112,21 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                         float dist[2])
 {
   int mask = 0;
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
   float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#  endif
+#endif
   if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
     if ((__float_as_uint(cnodes.x) & visibility))
-#  endif
+#endif
     {
       mask |= 1;
     }
   }
   if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
     if ((__float_as_uint(cnodes.y) & visibility))
-#  endif
+#endif
     {
       mask |= 2;
     }
@@ -152,125 +151,3 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
     return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
   }
 }
-
-#else /* !defined(__KERNEL_SSE2__) */
-
-int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                      const float3 &P,
-                                                      const float3 &dir,
-                                                      const ssef &tsplat,
-                                                      const ssef Psplat[3],
-                                                      const ssef idirsplat[3],
-                                                      const shuffle_swap_t shufflexyz[3],
-                                                      const int node_addr,
-                                                      const uint visibility,
-                                                      float dist[2])
-{
-  /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
-  /* fetch node data */
-  const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr;
-
-  /* intersect ray against child nodes */
-  const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-  const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-  const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-  /* calculate { c0min, c1min, -c0max, -c1max} */
-  ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-  const ssef tminmax = minmax ^ pn;
-  const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-  dist[0] = tminmax[0];
-  dist[1] = tminmax[1];
-
-  int mask = movemask(lrhit);
-
-#  ifdef __VISIBILITY_FLAG__
-  /* this visibility test gives a 5% performance hit, how to solve? */
-  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
-              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
-  return cmask;
-#  else
-  return mask & 3;
-#  endif
-}
-
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 dir,
-                                                        const ssef &isect_near,
-                                                        const ssef &isect_far,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
-{
-  Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
-  Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
-
-  float3 aligned_dir0 = transform_direction(&space0, dir),
-         aligned_dir1 = transform_direction(&space1, dir);
-  float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
-  float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
-         nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
-  ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
-       lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
-       lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
-
-  ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
-       upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
-       upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
-  ssef tnear_x = min(lower_x, upper_x);
-  ssef tnear_y = min(lower_y, upper_y);
-  ssef tnear_z = min(lower_z, upper_z);
-  ssef tfar_x = max(lower_x, upper_x);
-  ssef tfar_y = max(lower_y, upper_y);
-  ssef tfar_z = max(lower_z, upper_z);
-
-  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-  sseb vmask = tnear <= tfar;
-  dist[0] = tnear.f[0];
-  dist[1] = tnear.f[1];
-
-  int mask = (int)movemask(vmask);
-
-#  ifdef __VISIBILITY_FLAG__
-  /* this visibility test gives a 5% performance hit, how to solve? */
-  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
-              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
-  return cmask;
-#  else
-  return mask & 3;
-#  endif
-}
-
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                              const float3 &P,
-                                              const float3 &dir,
-                                              const ssef &isect_near,
-                                              const ssef &isect_far,
-                                              const ssef &tsplat,
-                                              const ssef Psplat[3],
-                                              const ssef idirsplat[3],
-                                              const shuffle_swap_t shufflexyz[3],
-                                              const int node_addr,
-                                              const uint visibility,
-                                              float dist[2])
-{
-  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
-  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-    return bvh_unaligned_node_intersect(
-        kg, P, dir, isect_near, isect_far, node_addr, visibility, dist);
-  }
-  else {
-    return bvh_aligned_node_intersect(
-        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist);
-  }
-}
-#endif /* !defined(__KERNEL_SSE2__) */
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 268bb149970..dccd257d2de 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -17,13 +17,6 @@
  * limitations under the License.
  */
 
-#ifdef __QBVH__
-#  include "kernel/bvh/qbvh_shadow_all.h"
-#  ifdef __KERNEL_AVX2__
-#    include "kernel/bvh/obvh_shadow_all.h"
-#  endif
-#endif
-
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
 #else
@@ -34,7 +27,6 @@
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
  *
- * BVH_INSTANCING: object instancing
  * BVH_HAIR: hair curve rendering
  * BVH_MOTION: motion blur rendering
  */
@@ -76,33 +68,11 @@ ccl_device_inline
   Transform ob_itfm;
 #endif
 
-#if BVH_FEATURE(BVH_INSTANCING)
   int num_hits_in_instance = 0;
-#endif
 
   *num_hits = 0;
   isect_array->t = tmax;
 
-#if defined(__KERNEL_SSE2__)
-  const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-  const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-  ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-  ssef tnear(0.0f), tfar(isect_t);
-#  endif
-  shuffle_swap_t shufflexyz[3];
-
-  Psplat[0] = ssef(P.x);
-  Psplat[1] = ssef(P.y);
-  Psplat[2] = ssef(P.z);
-
-  ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-  gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif /* __KERNEL_SSE2__ */
-
   /* traversal loop */
   do {
     do {
@@ -112,33 +82,16 @@ ccl_device_inline
         float dist[2];
         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
 
-#if !defined(__KERNEL_SSE2__)
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
-#  if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
                                        dir,
-#  endif
+#endif
                                        idir,
                                        isect_t,
                                        node_addr,
                                        visibility,
                                        dist);
-#else  // __KERNEL_SSE2__
-        traverse_mask = NODE_INTERSECT(kg,
-                                       P,
-                                       dir,
-#  if BVH_FEATURE(BVH_HAIR)
-                                       tnear,
-                                       tfar,
-#  endif
-                                       tsplat,
-                                       Psplat,
-                                       idirsplat,
-                                       shufflexyz,
-                                       node_addr,
-                                       visibility,
-                                       dist);
-#endif  // __KERNEL_SSE2__
 
         node_addr = __float_as_int(cnodes.z);
         node_addr_child1 = __float_as_int(cnodes.w);
@@ -174,9 +127,7 @@ ccl_device_inline
         float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
-#if BVH_FEATURE(BVH_INSTANCING)
         if (prim_addr >= 0) {
-#endif
           const int prim_addr2 = __float_as_int(leaf.y);
           const uint type = __float_as_int(leaf.w);
           const uint p_type = type & PRIMITIVE_ALL;
@@ -207,31 +158,13 @@ ccl_device_inline
               }
 #endif
 #if BVH_FEATURE(BVH_HAIR)
-              case PRIMITIVE_CURVE:
-              case PRIMITIVE_MOTION_CURVE: {
+              case PRIMITIVE_CURVE_THICK:
+              case PRIMITIVE_MOTION_CURVE_THICK:
+              case PRIMITIVE_CURVE_RIBBON:
+              case PRIMITIVE_MOTION_CURVE_RIBBON: {
                 const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-                if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                  hit = cardinal_curve_intersect(kg,
-                                                 isect_array,
-                                                 P,
-                                                 dir,
-                                                 visibility,
-                                                 object,
-                                                 prim_addr,
-                                                 ray->time,
-                                                 curve_type);
-                }
-                else {
-                  hit = curve_intersect(kg,
-                                        isect_array,
-                                        P,
-                                        dir,
-                                        visibility,
-                                        object,
-                                        prim_addr,
-                                        ray->time,
-                                        curve_type);
-                }
+                hit = curve_intersect(
+                    kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
                 break;
               }
 #endif
@@ -276,9 +209,7 @@ ccl_device_inline
               /* move on to next entry in intersections array */
               isect_array++;
               (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
               num_hits_in_instance++;
-#endif
 
               isect_array->t = isect_t;
             }
@@ -286,32 +217,19 @@ ccl_device_inline
             prim_addr++;
           }
         }
-#if BVH_FEATURE(BVH_INSTANCING)
         else {
           /* instance push */
           object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
 
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
           isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
+#else
           isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
+#endif
 
           num_hits_in_instance = 0;
           isect_array->t = isect_t;
 
-#  if defined(__KERNEL_SSE2__)
-          Psplat[0] = ssef(P.x);
-          Psplat[1] = ssef(P.y);
-          Psplat[2] = ssef(P.z);
-
-          tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-          tfar = ssef(isect_t);
-#    endif
-          gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
           ++stack_ptr;
           kernel_assert(stack_ptr < BVH_STACK_SIZE);
           traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -319,10 +237,8 @@ ccl_device_inline
           node_addr = kernel_tex_fetch(__object_node, object);
         }
       }
-#endif /* FEATURE(BVH_INSTANCING) */
     } while (node_addr != ENTRYPOINT_SENTINEL);
 
-#if BVH_FEATURE(BVH_INSTANCING)
     if (stack_ptr >= 0) {
       kernel_assert(object != OBJECT_NONE);
 
@@ -330,11 +246,11 @@ ccl_device_inline
       if (num_hits_in_instance) {
         float t_fac;
 
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
         bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
+#else
         bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
+#endif
 
         /* scale isect->t to adjust for instancing */
         for (int i = 0; i < num_hits_in_instance; i++) {
@@ -342,33 +258,20 @@ ccl_device_inline
         }
       }
       else {
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
         bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
+#else
         bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
+#endif
       }
 
       isect_t = tmax;
       isect_array->t = isect_t;
 
-#  if defined(__KERNEL_SSE2__)
-      Psplat[0] = ssef(P.x);
-      Psplat[1] = ssef(P.y);
-      Psplat[2] = ssef(P.z);
-
-      tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-      tfar = ssef(isect_t);
-#    endif
-      gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
       --stack_ptr;
     }
-#endif /* FEATURE(BVH_INSTANCING) */
   } while (node_addr != ENTRYPOINT_SENTINEL);
 
   return false;
@@ -381,20 +284,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint max_hits,
                                          uint *num_hits)
 {
-  switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
-    case BVH_LAYOUT_BVH8:
-      return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
-#endif
-#ifdef __QBVH__
-    case BVH_LAYOUT_BVH4:
-      return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
-#endif
-    case BVH_LAYOUT_BVH2:
-      return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
-  }
-  kernel_assert(!"Should not happen");
-  return false;
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 18afc6ae4eb..8b2699ab807 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -17,13 +17,6 @@
  * limitations under the License.
  */
 
-#ifdef __QBVH__
-#  include "kernel/bvh/qbvh_traversal.h"
-#endif
-#ifdef __KERNEL_AVX2__
-#  include "kernel/bvh/obvh_traversal.h"
-#endif
-
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
 #else
@@ -34,7 +27,6 @@
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
  *
- * BVH_INSTANCING: object instancing
  * BVH_HAIR: hair curve rendering
  * BVH_MOTION: motion blur rendering
  */
@@ -77,26 +69,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
   BVH_DEBUG_INIT();
 
-#if defined(__KERNEL_SSE2__)
-  const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-  const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-  ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-  ssef tnear(0.0f), tfar(isect->t);
-#  endif
-  shuffle_swap_t shufflexyz[3];
-
-  Psplat[0] = ssef(P.x);
-  Psplat[1] = ssef(P.y);
-  Psplat[2] = ssef(P.z);
-
-  ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
-  gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
   /* traversal loop */
   do {
     do {
@@ -106,37 +78,18 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
         float dist[2];
         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
 
-#if !defined(__KERNEL_SSE2__)
         {
           traverse_mask = NODE_INTERSECT(kg,
                                          P,
-#  if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
                                          dir,
-#  endif
+#endif
                                          idir,
                                          isect->t,
                                          node_addr,
                                          visibility,
                                          dist);
         }
-#else  // __KERNEL_SSE2__
-        {
-          traverse_mask = NODE_INTERSECT(kg,
-                                         P,
-                                         dir,
-#  if BVH_FEATURE(BVH_HAIR)
-                                         tnear,
-                                         tfar,
-#  endif
-                                         tsplat,
-                                         Psplat,
-                                         idirsplat,
-                                         shufflexyz,
-                                         node_addr,
-                                         visibility,
-                                         dist);
-        }
-#endif  // __KERNEL_SSE2__
 
         node_addr = __float_as_int(cnodes.z);
         node_addr_child1 = __float_as_int(cnodes.w);
@@ -173,9 +126,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
         float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
-#if BVH_FEATURE(BVH_INSTANCING)
         if (prim_addr >= 0) {
-#endif
           const int prim_addr2 = __float_as_int(leaf.y);
           const uint type = __float_as_int(leaf.w);
 
@@ -191,17 +142,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                 kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
                 if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
                   /* shadow ray early termination */
-#if defined(__KERNEL_SSE2__)
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
-                  tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-                  tfar = ssef(isect->t);
-#  endif
-#else
-                if (visibility & PATH_RAY_SHADOW_OPAQUE)
-                  return true;
-#endif
                 }
               }
               break;
@@ -214,51 +156,28 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                 if (motion_triangle_intersect(
                         kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
                   /* shadow ray early termination */
-#  if defined(__KERNEL_SSE2__)
-                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
-                    return true;
-                  tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-                  tfar = ssef(isect->t);
-#    endif
-#  else
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
-#  endif
                 }
               }
               break;
             }
 #endif /* BVH_FEATURE(BVH_MOTION) */
 #if BVH_FEATURE(BVH_HAIR)
-            case PRIMITIVE_CURVE:
-            case PRIMITIVE_MOTION_CURVE: {
+            case PRIMITIVE_CURVE_THICK:
+            case PRIMITIVE_MOTION_CURVE_THICK:
+            case PRIMITIVE_CURVE_RIBBON:
+            case PRIMITIVE_MOTION_CURVE_RIBBON: {
               for (; prim_addr < prim_addr2; prim_addr++) {
                 BVH_DEBUG_NEXT_INTERSECTION();
                 const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
                 kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
-                bool hit;
-                if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                  hit = cardinal_curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
-                else {
-                  hit = curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
+                const bool hit = curve_intersect(
+                    kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
                 if (hit) {
                   /* shadow ray early termination */
-#  if defined(__KERNEL_SSE2__)
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
-                  tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-                  tfar = ssef(isect->t);
-#    endif
-#  else
-                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
-                    return true;
-#  endif
                 }
               }
               break;
@@ -266,30 +185,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #endif /* BVH_FEATURE(BVH_HAIR) */
           }
         }
-#if BVH_FEATURE(BVH_INSTANCING)
         else {
           /* instance push */
           object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
 
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
           isect->t = bvh_instance_motion_push(
               kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
+#else
           isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-#  if defined(__KERNEL_SSE2__)
-          Psplat[0] = ssef(P.x);
-          Psplat[1] = ssef(P.y);
-          Psplat[2] = ssef(P.z);
-
-          tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-          tfar = ssef(isect->t);
-#    endif
-
-          gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
+#endif
 
           ++stack_ptr;
           kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -300,38 +205,22 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
           BVH_DEBUG_NEXT_INSTANCE();
         }
       }
-#endif /* FEATURE(BVH_INSTANCING) */
     } while (node_addr != ENTRYPOINT_SENTINEL);
 
-#if BVH_FEATURE(BVH_INSTANCING)
     if (stack_ptr >= 0) {
       kernel_assert(object != OBJECT_NONE);
 
       /* instance pop */
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
       isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
+#else
       isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-#  if defined(__KERNEL_SSE2__)
-      Psplat[0] = ssef(P.x);
-      Psplat[1] = ssef(P.y);
-      Psplat[2] = ssef(P.z);
-
-      tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-      tfar = ssef(isect->t);
-#    endif
-
-      gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
+#endif
 
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
       --stack_ptr;
     }
-#endif /* FEATURE(BVH_INSTANCING) */
   } while (node_addr != ENTRYPOINT_SENTINEL);
 
   return (isect->prim != PRIM_NONE);
@@ -342,20 +231,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          Intersection *isect,
                                          const uint visibility)
 {
-  switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
-    case BVH_LAYOUT_BVH8:
-      return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility);
-#endif
-#ifdef __QBVH__
-    case BVH_LAYOUT_BVH4:
-      return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
-#endif /* __QBVH__ */
-    case BVH_LAYOUT_BVH2:
-      return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
-  }
-  kernel_assert(!"Should not happen");
-  return false;
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 84dc0dbaef5..b173568266b 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -31,13 +31,10 @@ CCL_NAMESPACE_BEGIN
 
 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
 #define BVH_STACK_SIZE 192
-#define BVH_QSTACK_SIZE 384
-#define BVH_OSTACK_SIZE 768
 /* BVH intersection function variations */
 
-#define BVH_INSTANCING 1
-#define BVH_MOTION 2
-#define BVH_HAIR 4
+#define BVH_MOTION 1
+#define BVH_HAIR 2
 
 #define BVH_NAME_JOIN(x, y) x##_##y
 #define BVH_NAME_EVAL(x, y) BVH_NAME_JOIN(x, y)
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index c83b0d783f4..1f2ea47269b 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -17,13 +17,6 @@
  * limitations under the License.
  */
 
-#ifdef __QBVH__
-#  include "kernel/bvh/qbvh_volume.h"
-#  ifdef __KERNEL_AVX2__
-#    include "kernel/bvh/obvh_volume.h"
-#  endif
-#endif
-
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
 #else
@@ -34,7 +27,6 @@
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
  *
- * BVH_INSTANCING: object instancing
  * BVH_MOTION: motion blur rendering
  */
 
@@ -79,26 +71,6 @@ ccl_device_inline
   isect->prim = PRIM_NONE;
   isect->object = OBJECT_NONE;
 
-#if defined(__KERNEL_SSE2__)
-  const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-  const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-  ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-  ssef tnear(0.0f), tfar(isect->t);
-#  endif
-  shuffle_swap_t shufflexyz[3];
-
-  Psplat[0] = ssef(P.x);
-  Psplat[1] = ssef(P.y);
-  Psplat[2] = ssef(P.z);
-
-  ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
-  gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
   /* traversal loop */
   do {
     do {
@@ -108,33 +80,16 @@ ccl_device_inline
         float dist[2];
         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
 
-#if !defined(__KERNEL_SSE2__)
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
-#  if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
                                        dir,
-#  endif
+#endif
                                        idir,
                                        isect->t,
                                        node_addr,
                                        visibility,
                                        dist);
-#else  // __KERNEL_SSE2__
-        traverse_mask = NODE_INTERSECT(kg,
-                                       P,
-                                       dir,
-#  if BVH_FEATURE(BVH_HAIR)
-                                       tnear,
-                                       tfar,
-#  endif
-                                       tsplat,
-                                       Psplat,
-                                       idirsplat,
-                                       shufflexyz,
-                                       node_addr,
-                                       visibility,
-                                       dist);
-#endif  // __KERNEL_SSE2__
 
         node_addr = __float_as_int(cnodes.z);
         node_addr_child1 = __float_as_int(cnodes.w);
@@ -170,9 +125,7 @@ ccl_device_inline
         float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
-#if BVH_FEATURE(BVH_INSTANCING)
         if (prim_addr >= 0) {
-#endif
           const int prim_addr2 = __float_as_int(leaf.y);
           const uint type = __float_as_int(leaf.w);
 
@@ -222,31 +175,17 @@ ccl_device_inline
             }
           }
         }
-#if BVH_FEATURE(BVH_INSTANCING)
         else {
           /* instance push */
           object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
           int object_flag = kernel_tex_fetch(__object_flag, object);
           if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
             isect->t = bvh_instance_motion_push(
                 kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
+#else
             isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-#  if defined(__KERNEL_SSE2__)
-            Psplat[0] = ssef(P.x);
-            Psplat[1] = ssef(P.y);
-            Psplat[2] = ssef(P.z);
-
-            tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-            tfar = ssef(isect->t);
-#    endif
-
-            gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
+#endif
 
             ++stack_ptr;
             kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -262,38 +201,22 @@ ccl_device_inline
           }
         }
       }
-#endif /* FEATURE(BVH_INSTANCING) */
     } while (node_addr != ENTRYPOINT_SENTINEL);
 
-#if BVH_FEATURE(BVH_INSTANCING)
     if (stack_ptr >= 0) {
       kernel_assert(object != OBJECT_NONE);
 
       /* instance pop */
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
       isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
+#else
       isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-#  if defined(__KERNEL_SSE2__)
-      Psplat[0] = ssef(P.x);
-      Psplat[1] = ssef(P.y);
-      Psplat[2] = ssef(P.z);
-
-      tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-      tfar = ssef(isect->t);
-#    endif
-
-      gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
+#endif
 
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
       --stack_ptr;
     }
-#endif /* FEATURE(BVH_MOTION) */
   } while (node_addr != ENTRYPOINT_SENTINEL);
 
   return (isect->prim != PRIM_NONE);
@@ -304,20 +227,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          Intersection *isect,
                                          const uint visibility)
 {
-  switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
-    case BVH_LAYOUT_BVH8:
-      return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility);
-#endif
-#ifdef __QBVH__
-    case BVH_LAYOUT_BVH4:
-      return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
-#endif
-    case BVH_LAYOUT_BVH2:
-      return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
-  }
-  kernel_assert(!"Should not happen");
-  return false;
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index ae8c4d12e8a..a8664cc4331 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -17,13 +17,6 @@
  * limitations under the License.
  */
 
-#ifdef __QBVH__
-#  include "kernel/bvh/qbvh_volume_all.h"
-#  ifdef __KERNEL_AVX2__
-#    include "kernel/bvh/obvh_volume_all.h"
-#  endif
-#endif
-
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
 #else
@@ -34,7 +27,6 @@
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
  *
- * BVH_INSTANCING: object instancing
  * BVH_MOTION: motion blur rendering
  */
 
@@ -76,33 +68,11 @@ ccl_device_inline
   Transform ob_itfm;
 #endif
 
-#if BVH_FEATURE(BVH_INSTANCING)
   int num_hits_in_instance = 0;
-#endif
 
   uint num_hits = 0;
   isect_array->t = tmax;
 
-#if defined(__KERNEL_SSE2__)
-  const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-  const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-  ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-  ssef tnear(0.0f), tfar(isect_t);
-#  endif
-  shuffle_swap_t shufflexyz[3];
-
-  Psplat[0] = ssef(P.x);
-  Psplat[1] = ssef(P.y);
-  Psplat[2] = ssef(P.z);
-
-  ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-  gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif /* __KERNEL_SSE2__ */
-
   /* traversal loop */
   do {
     do {
@@ -112,33 +82,16 @@ ccl_device_inline
         float dist[2];
         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
 
-#if !defined(__KERNEL_SSE2__)
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
-#  if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
                                        dir,
-#  endif
+#endif
                                        idir,
                                        isect_t,
                                        node_addr,
                                        visibility,
                                        dist);
-#else  // __KERNEL_SSE2__
-        traverse_mask = NODE_INTERSECT(kg,
-                                       P,
-                                       dir,
-#  if BVH_FEATURE(BVH_HAIR)
-                                       tnear,
-                                       tfar,
-#  endif
-                                       tsplat,
-                                       Psplat,
-                                       idirsplat,
-                                       shufflexyz,
-                                       node_addr,
-                                       visibility,
-                                       dist);
-#endif  // __KERNEL_SSE2__
 
         node_addr = __float_as_int(cnodes.z);
         node_addr_child1 = __float_as_int(cnodes.w);
@@ -174,9 +127,7 @@ ccl_device_inline
         float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
-#if BVH_FEATURE(BVH_INSTANCING)
         if (prim_addr >= 0) {
-#endif
           const int prim_addr2 = __float_as_int(leaf.y);
           const uint type = __float_as_int(leaf.w);
           bool hit;
@@ -204,25 +155,21 @@ ccl_device_inline
                   /* Move on to next entry in intersections array. */
                   isect_array++;
                   num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
                   num_hits_in_instance++;
-#endif
                   isect_array->t = isect_t;
                   if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
                     if (object != OBJECT_NONE) {
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
                       float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
+#else
                       Transform itfm = object_fetch_transform(
                           kg, object, OBJECT_INVERSE_TRANSFORM);
                       float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
+#endif
                       for (int i = 0; i < num_hits_in_instance; i++) {
                         (isect_array - i - 1)->t *= t_fac;
                       }
                     }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
                     return num_hits;
                   }
                 }
@@ -248,25 +195,21 @@ ccl_device_inline
                   /* Move on to next entry in intersections array. */
                   isect_array++;
                   num_hits++;
-#  if BVH_FEATURE(BVH_INSTANCING)
                   num_hits_in_instance++;
-#  endif
                   isect_array->t = isect_t;
                   if (num_hits == max_hits) {
-#  if BVH_FEATURE(BVH_INSTANCING)
                     if (object != OBJECT_NONE) {
-#    if BVH_FEATURE(BVH_MOTION)
+#  if BVH_FEATURE(BVH_MOTION)
                       float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#    else
+#  else
                       Transform itfm = object_fetch_transform(
                           kg, object, OBJECT_INVERSE_TRANSFORM);
                       float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#    endif
+#  endif
                       for (int i = 0; i < num_hits_in_instance; i++) {
                         (isect_array - i - 1)->t *= t_fac;
                       }
                     }
-#  endif /* BVH_FEATURE(BVH_INSTANCING) */
                     return num_hits;
                   }
                 }
@@ -279,35 +222,21 @@ ccl_device_inline
             }
           }
         }
-#if BVH_FEATURE(BVH_INSTANCING)
         else {
           /* instance push */
           object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
           int object_flag = kernel_tex_fetch(__object_flag, object);
           if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
             isect_t = bvh_instance_motion_push(
                 kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
+#else
             isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
+#endif
 
             num_hits_in_instance = 0;
             isect_array->t = isect_t;
 
-#  if defined(__KERNEL_SSE2__)
-            Psplat[0] = ssef(P.x);
-            Psplat[1] = ssef(P.y);
-            Psplat[2] = ssef(P.z);
-
-            tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-            tfar = ssef(isect_t);
-#    endif
-
-            gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
             ++stack_ptr;
             kernel_assert(stack_ptr < BVH_STACK_SIZE);
             traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -322,55 +251,39 @@ ccl_device_inline
           }
         }
       }
-#endif /* FEATURE(BVH_INSTANCING) */
     } while (node_addr != ENTRYPOINT_SENTINEL);
 
-#if BVH_FEATURE(BVH_INSTANCING)
     if (stack_ptr >= 0) {
       kernel_assert(object != OBJECT_NONE);
 
       /* Instance pop. */
       if (num_hits_in_instance) {
         float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
         bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
+#else
         bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
+#endif
         /* Scale isect->t to adjust for instancing. */
         for (int i = 0; i < num_hits_in_instance; i++) {
           (isect_array - i - 1)->t *= t_fac;
         }
       }
       else {
-#  if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
         bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
+#else
         bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
+#endif
       }
 
       isect_t = tmax;
       isect_array->t = isect_t;
 
-#  if defined(__KERNEL_SSE2__)
-      Psplat[0] = ssef(P.x);
-      Psplat[1] = ssef(P.y);
-      Psplat[2] = ssef(P.z);
-
-      tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-      tfar = ssef(isect_t);
-#    endif
-
-      gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
       --stack_ptr;
     }
-#endif /* FEATURE(BVH_INSTANCING) */
   } while (node_addr != ENTRYPOINT_SENTINEL);
 
   return num_hits;
@@ -382,20 +295,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint max_hits,
                                          const uint visibility)
 {
-  switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
-    case BVH_LAYOUT_BVH8:
-      return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, max_hits, visibility);
-#endif
-#ifdef __QBVH__
-    case BVH_LAYOUT_BVH4:
-      return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, visibility);
-#endif
-    case BVH_LAYOUT_BVH2:
-      return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility);
-  }
-  kernel_assert(!"Should not happen");
-  return 0;
+  return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility);
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
deleted file mode 100644
index e6bb548bc5b..00000000000
--- a/intern/cycles/kernel/bvh/obvh_local.h
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT obvh_node_intersect
-#else
-#  define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             LocalIntersection *local_isect,
-                                             int local_object,
-                                             uint *lcg_state,
-                                             int max_hits)
-{
-  /* Traversal stack in CUDA thread-local memory. */
-  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_tex_fetch(__object_node, local_object);
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = ray->t;
-
-  if (local_isect != NULL) {
-    local_isect->num_hits = 0;
-  }
-  kernel_assert((local_isect == NULL) == (max_hits == 0));
-
-  const int object_flag = kernel_tex_fetch(__object_flag, local_object);
-  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
-    Transform ob_itfm;
-    isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#else
-    isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
-#endif
-    object = local_object;
-  }
-
-  avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
-  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        avxf dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-          avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c3;
-            traversal_stack[stack_ptr].dist = d3;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-
-          /* Five children are hit, push all onto stack and sort 5
-           * stack items, continue with closest child
-           */
-          r = __bscf(child_mask);
-          int c4 = __float_as_int(cnodes[r]);
-          float d4 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-          /* Six children are hit, push all onto stack and sort 6
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c5 = __float_as_int(cnodes[r]);
-          float d5 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c5;
-            traversal_stack[stack_ptr].dist = d5;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c5;
-          traversal_stack[stack_ptr].dist = d5;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c4;
-          traversal_stack[stack_ptr].dist = d4;
-
-          /* Seven children are hit, push all onto stack and sort 7
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c6 = __float_as_int(cnodes[r]);
-          float d6 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c6;
-            traversal_stack[stack_ptr].dist = d6;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5],
-                            &traversal_stack[stack_ptr - 6]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-          /* Eight children are hit, push all onto stack and sort 8
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c7 = __float_as_int(cnodes[r]);
-          float d7 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c7;
-          traversal_stack[stack_ptr].dist = d7;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c6;
-          traversal_stack[stack_ptr].dist = d6;
-          obvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3],
-                          &traversal_stack[stack_ptr - 4],
-                          &traversal_stack[stack_ptr - 5],
-                          &traversal_stack[stack_ptr - 6],
-                          &traversal_stack[stack_ptr - 7]);
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-        int prim_addr = __float_as_int(leaf.x);
-
-        int prim_addr2 = __float_as_int(leaf.y);
-        const uint type = __float_as_int(leaf.w);
-
-        /* Pop. */
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-
-        /* Primitive intersection. */
-        switch (type & PRIMITIVE_ALL) {
-          case PRIMITIVE_TRIANGLE: {
-            /* Intersect ray against primitive, */
-            for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              if (triangle_intersect_local(kg,
-                                           local_isect,
-                                           P,
-                                           dir,
-                                           object,
-                                           local_object,
-                                           prim_addr,
-                                           isect_t,
-                                           lcg_state,
-                                           max_hits)) {
-                return true;
-              }
-            }
-            break;
-          }
-#if BVH_FEATURE(BVH_MOTION)
-          case PRIMITIVE_MOTION_TRIANGLE: {
-            /* Intersect ray against primitive. */
-            for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              if (motion_triangle_intersect_local(kg,
-                                                  local_isect,
-                                                  P,
-                                                  dir,
-                                                  ray->time,
-                                                  object,
-                                                  local_object,
-                                                  prim_addr,
-                                                  isect_t,
-                                                  lcg_state,
-                                                  max_hits)) {
-                return true;
-              }
-            }
-            break;
-          }
-#endif
-          default:
-            break;
-        }
-      }
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-  return false;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h
deleted file mode 100644
index e5c935b75ed..00000000000
--- a/intern/cycles/kernel/bvh/obvh_nodes.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Aligned nodes intersection AVX code is adopted from Embree,
- */
-
-struct OBVHStackItem {
-  int addr;
-  float dist;
-};
-
-ccl_device_inline void obvh_near_far_idx_calc(const float3 &idir,
-                                              int *ccl_restrict near_x,
-                                              int *ccl_restrict near_y,
-                                              int *ccl_restrict near_z,
-                                              int *ccl_restrict far_x,
-                                              int *ccl_restrict far_y,
-                                              int *ccl_restrict far_z)
-
-{
-#ifdef __KERNEL_SSE__
-  *near_x = 0;
-  *far_x = 1;
-  *near_y = 2;
-  *far_y = 3;
-  *near_z = 4;
-  *far_z = 5;
-
-  const size_t mask = movemask(ssef(idir.m128));
-
-  const int mask_x = mask & 1;
-  const int mask_y = (mask & 2) >> 1;
-  const int mask_z = (mask & 4) >> 2;
-
-  *near_x += mask_x;
-  *far_x -= mask_x;
-  *near_y += mask_y;
-  *far_y -= mask_y;
-  *near_z += mask_z;
-  *far_z -= mask_z;
-#else
-  if (idir.x >= 0.0f) {
-    *near_x = 0;
-    *far_x = 1;
-  }
-  else {
-    *near_x = 1;
-    *far_x = 0;
-  }
-  if (idir.y >= 0.0f) {
-    *near_y = 2;
-    *far_y = 3;
-  }
-  else {
-    *near_y = 3;
-    *far_y = 2;
-  }
-  if (idir.z >= 0.0f) {
-    *near_z = 4;
-    *far_z = 5;
-  }
-  else {
-    *near_z = 5;
-    *far_z = 4;
-  }
-#endif
-}
-
-ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, OBVHStackItem *ccl_restrict b)
-{
-  OBVHStackItem tmp = *a;
-  *a = *b;
-  *b = tmp;
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3)
-{
-  if (s2->dist < s1->dist) {
-    obvh_item_swap(s2, s1);
-  }
-  if (s3->dist < s2->dist) {
-    obvh_item_swap(s3, s2);
-  }
-  if (s2->dist < s1->dist) {
-    obvh_item_swap(s2, s1);
-  }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3,
-                                       OBVHStackItem *ccl_restrict s4)
-{
-  if (s2->dist < s1->dist) {
-    obvh_item_swap(s2, s1);
-  }
-  if (s4->dist < s3->dist) {
-    obvh_item_swap(s4, s3);
-  }
-  if (s3->dist < s1->dist) {
-    obvh_item_swap(s3, s1);
-  }
-  if (s4->dist < s2->dist) {
-    obvh_item_swap(s4, s2);
-  }
-  if (s3->dist < s2->dist) {
-    obvh_item_swap(s3, s2);
-  }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3,
-                                       OBVHStackItem *ccl_restrict s4,
-                                       OBVHStackItem *ccl_restrict s5)
-{
-  obvh_stack_sort(s1, s2, s3, s4);
-  if (s5->dist < s4->dist) {
-    obvh_item_swap(s4, s5);
-    if (s4->dist < s3->dist) {
-      obvh_item_swap(s3, s4);
-      if (s3->dist < s2->dist) {
-        obvh_item_swap(s2, s3);
-        if (s2->dist < s1->dist) {
-          obvh_item_swap(s1, s2);
-        }
-      }
-    }
-  }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3,
-                                       OBVHStackItem *ccl_restrict s4,
-                                       OBVHStackItem *ccl_restrict s5,
-                                       OBVHStackItem *ccl_restrict s6)
-{
-  obvh_stack_sort(s1, s2, s3, s4, s5);
-  if (s6->dist < s5->dist) {
-    obvh_item_swap(s5, s6);
-    if (s5->dist < s4->dist) {
-      obvh_item_swap(s4, s5);
-      if (s4->dist < s3->dist) {
-        obvh_item_swap(s3, s4);
-        if (s3->dist < s2->dist) {
-          obvh_item_swap(s2, s3);
-          if (s2->dist < s1->dist) {
-            obvh_item_swap(s1, s2);
-          }
-        }
-      }
-    }
-  }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3,
-                                       OBVHStackItem *ccl_restrict s4,
-                                       OBVHStackItem *ccl_restrict s5,
-                                       OBVHStackItem *ccl_restrict s6,
-                                       OBVHStackItem *ccl_restrict s7)
-{
-  obvh_stack_sort(s1, s2, s3, s4, s5, s6);
-  if (s7->dist < s6->dist) {
-    obvh_item_swap(s6, s7);
-    if (s6->dist < s5->dist) {
-      obvh_item_swap(s5, s6);
-      if (s5->dist < s4->dist) {
-        obvh_item_swap(s4, s5);
-        if (s4->dist < s3->dist) {
-          obvh_item_swap(s3, s4);
-          if (s3->dist < s2->dist) {
-            obvh_item_swap(s2, s3);
-            if (s2->dist < s1->dist) {
-              obvh_item_swap(s1, s2);
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
-                                       OBVHStackItem *ccl_restrict s2,
-                                       OBVHStackItem *ccl_restrict s3,
-                                       OBVHStackItem *ccl_restrict s4,
-                                       OBVHStackItem *ccl_restrict s5,
-                                       OBVHStackItem *ccl_restrict s6,
-                                       OBVHStackItem *ccl_restrict s7,
-                                       OBVHStackItem *ccl_restrict s8)
-{
-  obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
-  if (s8->dist < s7->dist) {
-    obvh_item_swap(s7, s8);
-    if (s7->dist < s6->dist) {
-      obvh_item_swap(s6, s7);
-      if (s6->dist < s5->dist) {
-        obvh_item_swap(s5, s6);
-        if (s5->dist < s4->dist) {
-          obvh_item_swap(s4, s5);
-          if (s4->dist < s3->dist) {
-            obvh_item_swap(s3, s4);
-            if (s3->dist < s2->dist) {
-              obvh_item_swap(s2, s3);
-              if (s2->dist < s1->dist) {
-                obvh_item_swap(s1, s2);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-/* Axis-aligned nodes intersection */
-
-ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
-                                                  const avxf &isect_near,
-                                                  const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
-                                                  const avx3f &org_idir,
-#else
-                                                  const avx3f &org,
-#endif
-                                                  const avx3f &idir,
-                                                  const int near_x,
-                                                  const int near_y,
-                                                  const int near_z,
-                                                  const int far_x,
-                                                  const int far_y,
-                                                  const int far_z,
-                                                  const int node_addr,
-                                                  avxf *ccl_restrict dist)
-{
-  const int offset = node_addr + 2;
-#ifdef __KERNEL_AVX2__
-  const avxf tnear_x = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, org_idir.x);
-  const avxf tnear_y = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, org_idir.y);
-  const avxf tnear_z = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, org_idir.z);
-  const avxf tfar_x = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, org_idir.x);
-  const avxf tfar_y = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, org_idir.y);
-  const avxf tfar_z = msub(
-      kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, org_idir.z);
-
-  const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-  const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
-  const avxb vmask = tnear <= tfar;
-  int mask = (int)movemask(vmask);
-  *dist = tnear;
-  return mask;
-#else
-  return 0;
-#endif
-}
-
-/* Unaligned nodes intersection */
-
-ccl_device_inline int obvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg,
-                                                    const avxf &isect_near,
-                                                    const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
-                                                    const avx3f &org_idir,
-#endif
-                                                    const avx3f &org,
-                                                    const avx3f &dir,
-                                                    const avx3f &idir,
-                                                    const int near_x,
-                                                    const int near_y,
-                                                    const int near_z,
-                                                    const int far_x,
-                                                    const int far_y,
-                                                    const int far_z,
-                                                    const int node_addr,
-                                                    avxf *ccl_restrict dist)
-{
-  const int offset = node_addr;
-  const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2);
-  const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4);
-  const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6);
-
-  const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8);
-  const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10);
-  const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12);
-
-  const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14);
-  const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16);
-  const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18);
-
-  const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20);
-  const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22);
-  const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24);
-
-  const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
-             aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
-             aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
-  const avxf aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x,
-             aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y,
-             aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z;
-
-  const avxf neg_one(-1.0f);
-  const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
-             nrdir_z = neg_one / aligned_dir_z;
-
-  const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
-             tlower_z = aligned_P_z * nrdir_z;
-
-  const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
-             tupper_z = tlower_z - nrdir_z;
-
-  const avxf tnear_x = min(tlower_x, tupper_x);
-  const avxf tnear_y = min(tlower_y, tupper_y);
-  const avxf tnear_z = min(tlower_z, tupper_z);
-  const avxf tfar_x = max(tlower_x, tupper_x);
-  const avxf tfar_y = max(tlower_y, tupper_y);
-  const avxf tfar_z = max(tlower_z, tupper_z);
-  const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-  const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-  const avxb vmask = tnear <= tfar;
-  *dist = tnear;
-  return movemask(vmask);
-}
-
-/* Intersectors wrappers.
- *
- * They'll check node type and call appropriate intersection code.
- */
-
-ccl_device_inline int obvh_node_intersect(KernelGlobals *ccl_restrict kg,
-                                          const avxf &isect_near,
-                                          const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
-                                          const avx3f &org_idir,
-#endif
-                                          const avx3f &org,
-                                          const avx3f &dir,
-                                          const avx3f &idir,
-                                          const int near_x,
-                                          const int near_y,
-                                          const int near_z,
-                                          const int far_x,
-                                          const int far_y,
-                                          const int far_z,
-                                          const int node_addr,
-                                          avxf *ccl_restrict dist)
-{
-  const int offset = node_addr;
-  const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
-  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-    return obvh_unaligned_node_intersect(kg,
-                                         isect_near,
-                                         isect_far,
-#ifdef __KERNEL_AVX2__
-                                         org_idir,
-#endif
-                                         org,
-                                         dir,
-                                         idir,
-                                         near_x,
-                                         near_y,
-                                         near_z,
-                                         far_x,
-                                         far_y,
-                                         far_z,
-                                         node_addr,
-                                         dist);
-  }
-  else {
-    return obvh_aligned_node_intersect(kg,
-                                       isect_near,
-                                       isect_far,
-#ifdef __KERNEL_AVX2__
-                                       org_idir,
-#else
-                                       org,
-#endif
-                                       idir,
-                                       near_x,
-                                       near_y,
-                                       near_z,
-                                       far_x,
-                                       far_y,
-                                       far_z,
-                                       node_addr,
-                                       dist);
-  }
-}
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
deleted file mode 100644
index b7ab75b723c..00000000000
--- a/intern/cycles/kernel/bvh/obvh_shadow_all.h
+++ /dev/null
@@ -1,664 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT obvh_node_intersect
-#else
-#  define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const int skip_object,
-                                             const uint max_hits,
-                                             uint *num_hits)
-{
-  /* TODO(sergey):
-   *  - Test if pushing distance on the stack helps.
-   * - Likely and unlikely for if() statements.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  const float tmax = ray->t;
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  *num_hits = 0;
-  isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
-  int num_hits_in_instance = 0;
-#endif
-
-  avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
-  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-        (void)inodes;
-
-        if (false
-#ifdef __VISIBILITY_FLAG__
-            || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
-#endif
-#if BVH_FEATURE(BVH_MOTION)
-            || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-        ) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        avxf dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        //#if !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c3;
-            traversal_stack[stack_ptr].dist = d3;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-
-          /* Five children are hit, push all onto stack and sort 5
-           * stack items, continue with closest child
-           */
-          r = __bscf(child_mask);
-          int c4 = __float_as_int(cnodes[r]);
-          float d4 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Six children are hit, push all onto stack and sort 6
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c5 = __float_as_int(cnodes[r]);
-          float d5 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c5;
-            traversal_stack[stack_ptr].dist = d5;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c5;
-          traversal_stack[stack_ptr].dist = d5;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c4;
-          traversal_stack[stack_ptr].dist = d4;
-
-          /* Seven children are hit, push all onto stack and sort 7
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c6 = __float_as_int(cnodes[r]);
-          float d6 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c6;
-            traversal_stack[stack_ptr].dist = d6;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5],
-                            &traversal_stack[stack_ptr - 6]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Eight children are hit, push all onto stack and sort 8
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c7 = __float_as_int(cnodes[r]);
-          float d7 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c7;
-          traversal_stack[stack_ptr].dist = d7;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c6;
-          traversal_stack[stack_ptr].dist = d6;
-          obvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3],
-                          &traversal_stack[stack_ptr - 4],
-                          &traversal_stack[stack_ptr - 5],
-                          &traversal_stack[stack_ptr - 6],
-                          &traversal_stack[stack_ptr - 7]);
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          if (p_type == PRIMITIVE_TRIANGLE) {
-            int prim_count = prim_addr2 - prim_addr;
-            if (prim_count < 3) {
-              while (prim_addr < prim_addr2) {
-                kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) ==
-                              p_type);
-                int hit = triangle_intersect(
-                    kg, isect_array, P, dir, PATH_RAY_SHADOW, object, prim_addr);
-                /* Shadow ray early termination. */
-                if (hit) {
-                  /* detect if this surface has a shader with transparent shadows */
-
-                  /* todo: optimize so primitive visibility flag indicates if
-                   * the primitive has a transparent shadow shader? */
-                  int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-                  int shader = 0;
-
-#ifdef __HAIR__
-                  if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-                  {
-                    shader = kernel_tex_fetch(__tri_shader, prim);
-                  }
-#ifdef __HAIR__
-                  else {
-                    float4 str = kernel_tex_fetch(__curves, prim);
-                    shader = __float_as_int(str.z);
-                  }
-#endif
-                  int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
-                  /* if no transparent shadows, all light is blocked */
-                  if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-                    return true;
-                  }
-                  /* if maximum number of hits reached, block all light */
-                  else if (*num_hits == max_hits) {
-                    return true;
-                  }
-
-                  /* move on to next entry in intersections array */
-                  isect_array++;
-                  (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
-                  num_hits_in_instance++;
-#endif
-
-                  isect_array->t = isect_t;
-                }
-
-                prim_addr++;
-              }  // while
-            }
-            else {
-              kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) ==
-                            p_type);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-              int *nhiptr = &num_hits_in_instance;
-#else
-            int nhi = 0;
-            int *nhiptr = &nhi;
-#endif
-
-              int result = triangle_intersect8(kg,
-                                               &isect_array,
-                                               P,
-                                               dir,
-                                               PATH_RAY_SHADOW,
-                                               object,
-                                               prim_addr,
-                                               prim_count,
-                                               num_hits,
-                                               max_hits,
-                                               nhiptr,
-                                               isect_t);
-              if (result == 2) {
-                return true;
-              }
-            }  // prim_count
-          }    // PRIMITIVE_TRIANGLE
-          else {
-            while (prim_addr < prim_addr2) {
-              kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
-              uint tri_object = (object == OBJECT_NONE) ?
-                                    kernel_tex_fetch(__prim_object, prim_addr) :
-                                    object;
-              if (tri_object == skip_object) {
-                ++prim_addr;
-                continue;
-              }
-#endif
-
-              bool hit;
-
-              /* todo: specialized intersect functions which don't fill in
-               * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
-               * might give a few % performance improvement */
-
-              switch (p_type) {
-
-#if BVH_FEATURE(BVH_MOTION)
-                case PRIMITIVE_MOTION_TRIANGLE: {
-                  hit = motion_triangle_intersect(
-                      kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, prim_addr);
-                  break;
-                }
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                case PRIMITIVE_CURVE:
-                case PRIMITIVE_MOTION_CURVE: {
-                  const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-                  if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                    hit = cardinal_curve_intersect(kg,
-                                                   isect_array,
-                                                   P,
-                                                   dir,
-                                                   PATH_RAY_SHADOW,
-                                                   object,
-                                                   prim_addr,
-                                                   ray->time,
-                                                   curve_type);
-                  }
-                  else {
-                    hit = curve_intersect(kg,
-                                          isect_array,
-                                          P,
-                                          dir,
-                                          PATH_RAY_SHADOW,
-                                          object,
-                                          prim_addr,
-                                          ray->time,
-                                          curve_type);
-                  }
-                  break;
-                }
-#endif
-                default: {
-                  hit = false;
-                  break;
-                }
-              }
-
-              /* Shadow ray early termination. */
-              if (hit) {
-                /* detect if this surface has a shader with transparent shadows */
-
-                /* todo: optimize so primitive visibility flag indicates if
-                 * the primitive has a transparent shadow shader? */
-                int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-                int shader = 0;
-
-#ifdef __HAIR__
-                if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-                {
-                  shader = kernel_tex_fetch(__tri_shader, prim);
-                }
-#ifdef __HAIR__
-                else {
-                  float4 str = kernel_tex_fetch(__curves, prim);
-                  shader = __float_as_int(str.z);
-                }
-#endif
-                int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
-                /* if no transparent shadows, all light is blocked */
-                if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-                  return true;
-                }
-                /* if maximum number of hits reached, block all light */
-                else if (*num_hits == max_hits) {
-                  return true;
-                }
-
-                /* move on to next entry in intersections array */
-                isect_array++;
-                (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
-                num_hits_in_instance++;
-#endif
-
-                isect_array->t = isect_t;
-              }
-
-              prim_addr++;
-            }  // while prim
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-          isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
-          isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
-
-          num_hits_in_instance = 0;
-          isect_array->t = isect_t;
-
-          obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-          tfar = avxf(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-          dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-          idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-          P_idir = P * idir;
-          P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-          org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-          node_addr = kernel_tex_fetch(__object_node, object);
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
-      }
-
-      isect_t = tmax;
-      isect_array->t = isect_t;
-
-      obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = avxf(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-      idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return false;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
deleted file mode 100644
index 9095233f8b6..00000000000
--- a/intern/cycles/kernel/bvh/obvh_traversal.h
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT obvh_node_intersect
-#else
-#  define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility)
-{
-  /* Traversal stack in CUDA thread-local memory. */
-  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-  traversal_stack[0].dist = -FLT_MAX;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-  float node_dist = -FLT_MAX;
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
-  isect->u = 0.0f;
-  isect->v = 0.0f;
-  isect->prim = PRIM_NONE;
-  isect->object = OBJECT_NONE;
-
-  BVH_DEBUG_INIT();
-  avxf tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
-  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-        (void)inodes;
-
-        if (UNLIKELY(node_dist > isect->t)
-#if BVH_FEATURE(BVH_MOTION)
-            || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-#ifdef __VISIBILITY_FLAG__
-            || (__float_as_uint(inodes.x) & visibility) == 0
-#endif
-        ) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-          continue;
-        }
-
-        int child_mask;
-        avxf dist;
-
-        BVH_DEBUG_NEXT_NODE();
-
-        {
-          child_mask = NODE_INTERSECT(kg,
-                                      tnear,
-                                      tfar,
-#ifdef __KERNEL_AVX2__
-                                      P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                      org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                      dir4,
-#endif
-                                      idir4,
-                                      near_x,
-                                      near_y,
-                                      near_z,
-                                      far_x,
-                                      far_y,
-                                      far_z,
-                                      node_addr,
-                                      &dist);
-        }
-
-        if (child_mask != 0) {
-          avxf cnodes;
-          /* TODO(sergey): Investigate whether moving cnodes upwards
-           * gives a speedup (will be different cache pattern but will
-           * avoid extra check here).
-           */
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          float d0 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            node_dist = d0;
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              node_dist = d1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              node_dist = d0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c3;
-            traversal_stack[stack_ptr].dist = d3;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-
-          /* Five children are hit, push all onto stack and sort 5
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c4 = __float_as_int(cnodes[r]);
-          float d4 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Six children are hit, push all onto stack and sort 6
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c5 = __float_as_int(cnodes[r]);
-          float d5 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c5;
-            traversal_stack[stack_ptr].dist = d5;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c5;
-          traversal_stack[stack_ptr].dist = d5;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c4;
-          traversal_stack[stack_ptr].dist = d4;
-
-          /* Seven children are hit, push all onto stack and sort 7
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c6 = __float_as_int(cnodes[r]);
-          float d6 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c6;
-            traversal_stack[stack_ptr].dist = d6;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5],
-                            &traversal_stack[stack_ptr - 6]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Eight children are hit, push all onto stack and sort 8
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c7 = __float_as_int(cnodes[r]);
-          float d7 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c7;
-          traversal_stack[stack_ptr].dist = d7;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c6;
-          traversal_stack[stack_ptr].dist = d6;
-          obvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3],
-                          &traversal_stack[stack_ptr - 4],
-                          &traversal_stack[stack_ptr - 5],
-                          &traversal_stack[stack_ptr - 6],
-                          &traversal_stack[stack_ptr - 7]);
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-          continue;
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        node_dist = traversal_stack[stack_ptr].dist;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-#ifdef __VISIBILITY_FLAG__
-        if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
-#else
-        if (UNLIKELY((node_dist > isect->t)))
-#endif
-        {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-          continue;
-        }
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (type & PRIMITIVE_ALL) {
-            case PRIMITIVE_TRIANGLE: {
-              int prim_count = prim_addr2 - prim_addr;
-              if (prim_count < 3) {
-                for (; prim_addr < prim_addr2; prim_addr++) {
-                  BVH_DEBUG_NEXT_INTERSECTION();
-                  kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                  if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
-                    tfar = avxf(isect->t);
-                    /* Shadow ray early termination. */
-                    if (visibility == PATH_RAY_SHADOW_OPAQUE) {
-                      return true;
-                    }
-                  }
-                }  // for
-              }
-              else {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                if (triangle_intersect8(kg,
-                                        &isect,
-                                        P,
-                                        dir,
-                                        visibility,
-                                        object,
-                                        prim_addr,
-                                        prim_count,
-                                        0,
-                                        0,
-                                        NULL,
-                                        0.0f)) {
-                  tfar = avxf(isect->t);
-                  if (visibility == PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }  // prim count
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                BVH_DEBUG_NEXT_INTERSECTION();
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                if (motion_triangle_intersect(
-                        kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
-                  tfar = avxf(isect->t);
-                  /* Shadow ray early termination. */
-                  if (visibility == PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }
-              break;
-            }
-#endif /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
-            case PRIMITIVE_CURVE:
-            case PRIMITIVE_MOTION_CURVE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                BVH_DEBUG_NEXT_INTERSECTION();
-                const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-                kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
-                bool hit;
-                if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                  hit = cardinal_curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
-                else {
-                  hit = curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
-                if (hit) {
-                  tfar = avxf(isect->t);
-                  /* Shadow ray early termination. */
-                  if (visibility == PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }
-              break;
-            }
-#endif /* BVH_FEATURE(BVH_HAIR) */
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-          qbvh_instance_motion_push(
-              kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
-#  else
-          qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
-#  endif
-
-          obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-          tfar = avxf(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-          dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-          idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-          P_idir = P * idir;
-          P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-          org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-          traversal_stack[stack_ptr].dist = -FLT_MAX;
-
-          node_addr = kernel_tex_fetch(__object_node, object);
-
-          BVH_DEBUG_NEXT_INSTANCE();
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-      obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = avxf(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-      idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      node_dist = traversal_stack[stack_ptr].dist;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
deleted file mode 100644
index fb41ae783ab..00000000000
--- a/intern/cycles/kernel/bvh/obvh_volume.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT obvh_node_intersect
-#else
-#  define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility)
-{
-  /* Traversal stack in CUDA thread-local memory. */
-  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
-  isect->u = 0.0f;
-  isect->v = 0.0f;
-  isect->prim = PRIM_NONE;
-  isect->object = OBJECT_NONE;
-
-  avxf tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
-  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(inodes.x) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        avxf dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c3;
-            traversal_stack[stack_ptr].dist = d3;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-
-          /* Five children are hit, push all onto stack and sort 5
-           * stack items, continue with closest child
-           */
-          r = __bscf(child_mask);
-          int c4 = __float_as_int(cnodes[r]);
-          float d4 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Six children are hit, push all onto stack and sort 6
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c5 = __float_as_int(cnodes[r]);
-          float d5 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c5;
-            traversal_stack[stack_ptr].dist = d5;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c5;
-          traversal_stack[stack_ptr].dist = d5;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c4;
-          traversal_stack[stack_ptr].dist = d4;
-
-          /* Seven children are hit, push all onto stack and sort 7
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c6 = __float_as_int(cnodes[r]);
-          float d6 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c6;
-            traversal_stack[stack_ptr].dist = d6;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5],
-                            &traversal_stack[stack_ptr - 6]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Eight children are hit, push all onto stack and sort 8
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c7 = __float_as_int(cnodes[r]);
-          float d7 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c7;
-          traversal_stack[stack_ptr].dist = d7;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c6;
-          traversal_stack[stack_ptr].dist = d6;
-          obvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3],
-                          &traversal_stack[stack_ptr - 4],
-                          &traversal_stack[stack_ptr - 5],
-                          &traversal_stack[stack_ptr - 6],
-                          &traversal_stack[stack_ptr - 7]);
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-        if ((__float_as_uint(leaf.z) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (p_type) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
-              }
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                motion_triangle_intersect(
-                    kg, isect, P, dir, ray->time, visibility, object, prim_addr);
-              }
-              break;
-            }
-#endif
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
-          if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
-            isect->t = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-            isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-            obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-            tfar = avxf(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-            dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-            idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-            P_idir = P * idir;
-            P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-            org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-            node_addr = kernel_tex_fetch(__object_node, object);
-          }
-          else {
-            /* Pop. */
-            object = OBJECT_NONE;
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-          }
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-      obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = avxf(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-      idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
deleted file mode 100644
index 56e2afd4a11..00000000000
--- a/intern/cycles/kernel/bvh/obvh_volume_all.h
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT obvh_node_intersect
-#else
-#  define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const uint max_hits,
-                                             const uint visibility)
-{
-  /* Traversal stack in CUDA thread-local memory. */
-  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  const float tmax = ray->t;
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  uint num_hits = 0;
-  isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
-  int num_hits_in_instance = 0;
-#endif
-
-  avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
-  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(inodes.x) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        avxf dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c3;
-            traversal_stack[stack_ptr].dist = d3;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-
-          /* Five children are hit, push all onto stack and sort 5
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c4 = __float_as_int(cnodes[r]);
-          float d4 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Six children are hit, push all onto stack and sort 6
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c5 = __float_as_int(cnodes[r]);
-          float d5 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c5;
-            traversal_stack[stack_ptr].dist = d5;
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c4;
-            traversal_stack[stack_ptr].dist = d4;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c5;
-          traversal_stack[stack_ptr].dist = d5;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c4;
-          traversal_stack[stack_ptr].dist = d4;
-
-          /* Seven children are hit, push all onto stack and sort 7
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c6 = __float_as_int(cnodes[r]);
-          float d6 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c6;
-            traversal_stack[stack_ptr].dist = d6;
-            obvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2],
-                            &traversal_stack[stack_ptr - 3],
-                            &traversal_stack[stack_ptr - 4],
-                            &traversal_stack[stack_ptr - 5],
-                            &traversal_stack[stack_ptr - 6]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Eight children are hit, push all onto stack and sort 8
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c7 = __float_as_int(cnodes[r]);
-          float d7 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c7;
-          traversal_stack[stack_ptr].dist = d7;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c6;
-          traversal_stack[stack_ptr].dist = d6;
-          obvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3],
-                          &traversal_stack[stack_ptr - 4],
-                          &traversal_stack[stack_ptr - 5],
-                          &traversal_stack[stack_ptr - 6],
-                          &traversal_stack[stack_ptr - 7]);
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-        if ((__float_as_uint(leaf.z) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-          bool hit;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (p_type) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
-                if (hit) {
-                  /* Move on to next entry in intersections array. */
-                  isect_array++;
-                  num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
-                  num_hits_in_instance++;
-#endif
-                  isect_array->t = isect_t;
-                  if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
-#  if BVH_FEATURE(BVH_MOTION)
-                    float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-                    Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-                    float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-                    for (int i = 0; i < num_hits_in_instance; i++) {
-                      (isect_array - i - 1)->t *= t_fac;
-                    }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
-                    return num_hits;
-                  }
-                }
-              }
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                hit = motion_triangle_intersect(
-                    kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
-                if (hit) {
-                  /* Move on to next entry in intersections array. */
-                  isect_array++;
-                  num_hits++;
-#  if BVH_FEATURE(BVH_INSTANCING)
-                  num_hits_in_instance++;
-#  endif
-                  isect_array->t = isect_t;
-                  if (num_hits == max_hits) {
-#  if BVH_FEATURE(BVH_INSTANCING)
-#    if BVH_FEATURE(BVH_MOTION)
-                    float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#    else
-                    Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-                    float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#    endif
-                    for (int i = 0; i < num_hits_in_instance; i++) {
-                      (isect_array - i - 1)->t *= t_fac;
-                    }
-#  endif /* BVH_FEATURE(BVH_INSTANCING) */
-                    return num_hits;
-                  }
-                }
-              }
-              break;
-            }
-#endif
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
-          if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
-            isect_t = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
-            isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
-
-            obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-            tfar = avxf(isect_t);
-            idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  if BVH_FEATURE(BVH_HAIR)
-            dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-#  ifdef __KERNEL_AVX2__
-            P_idir = P * idir;
-            P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-            org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-            num_hits_in_instance = 0;
-            isect_array->t = isect_t;
-
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-            node_addr = kernel_tex_fetch(__object_node, object);
-          }
-          else {
-            /* Pop. */
-            object = OBJECT_NONE;
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-          }
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
-      }
-
-      isect_t = tmax;
-      isect_array->t = isect_t;
-
-      obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = avxf(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#  endif
-      idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return num_hits;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h
deleted file mode 100644
index b21f79bd3a0..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_local.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for finding local intersections
- * around the shading point, for subsurface scattering and bevel. We disable
- * various features for performance, and for instanced objects avoid traversing
- * other parts of the scene.
- *
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             LocalIntersection *local_isect,
-                                             int local_object,
-                                             uint *lcg_state,
-                                             int max_hits)
-{
-  /* TODO(sergey):
-   * - Test if pushing distance on the stack helps (for non shadow rays).
-   * - Separate version for shadow rays.
-   * - Likely and unlikely for if() statements.
-   * - SSE for hair.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_tex_fetch(__object_node, local_object);
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = ray->t;
-
-  if (local_isect != NULL) {
-    local_isect->num_hits = 0;
-  }
-  kernel_assert((local_isect == NULL) == (max_hits == 0));
-
-  const int object_flag = kernel_tex_fetch(__object_flag, local_object);
-  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
-    Transform ob_itfm;
-    isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#else
-    isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
-#endif
-    object = local_object;
-  }
-
-  ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-  sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        ssef dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-          float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            qbvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-          qbvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3]);
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-        int prim_addr = __float_as_int(leaf.x);
-
-        int prim_addr2 = __float_as_int(leaf.y);
-        const uint type = __float_as_int(leaf.w);
-
-        /* Pop. */
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-
-        /* Primitive intersection. */
-        switch (type & PRIMITIVE_ALL) {
-          case PRIMITIVE_TRIANGLE: {
-            /* Intersect ray against primitive, */
-            for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              if (triangle_intersect_local(kg,
-                                           local_isect,
-                                           P,
-                                           dir,
-                                           object,
-                                           local_object,
-                                           prim_addr,
-                                           isect_t,
-                                           lcg_state,
-                                           max_hits)) {
-                return true;
-              }
-            }
-            break;
-          }
-#if BVH_FEATURE(BVH_MOTION)
-          case PRIMITIVE_MOTION_TRIANGLE: {
-            /* Intersect ray against primitive. */
-            for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              if (motion_triangle_intersect_local(kg,
-                                                  local_isect,
-                                                  P,
-                                                  dir,
-                                                  ray->time,
-                                                  object,
-                                                  local_object,
-                                                  prim_addr,
-                                                  isect_t,
-                                                  lcg_state,
-                                                  max_hits)) {
-                return true;
-              }
-            }
-            break;
-          }
-#endif
-          default:
-            break;
-        }
-      }
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return false;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
deleted file mode 100644
index 070406fb18a..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Aligned nodes intersection SSE code is adopted from Embree,
- */
-
-struct QBVHStackItem {
-  int addr;
-  float dist;
-};
-
-ccl_device_inline void qbvh_near_far_idx_calc(const float3 &idir,
-                                              int *ccl_restrict near_x,
-                                              int *ccl_restrict near_y,
-                                              int *ccl_restrict near_z,
-                                              int *ccl_restrict far_x,
-                                              int *ccl_restrict far_y,
-                                              int *ccl_restrict far_z)
-
-{
-#ifdef __KERNEL_SSE__
-  *near_x = 0;
-  *far_x = 1;
-  *near_y = 2;
-  *far_y = 3;
-  *near_z = 4;
-  *far_z = 5;
-
-  const size_t mask = movemask(ssef(idir.m128));
-
-  const int mask_x = mask & 1;
-  const int mask_y = (mask & 2) >> 1;
-  const int mask_z = (mask & 4) >> 2;
-
-  *near_x += mask_x;
-  *far_x -= mask_x;
-  *near_y += mask_y;
-  *far_y -= mask_y;
-  *near_z += mask_z;
-  *far_z -= mask_z;
-#else
-  if (idir.x >= 0.0f) {
-    *near_x = 0;
-    *far_x = 1;
-  }
-  else {
-    *near_x = 1;
-    *far_x = 0;
-  }
-  if (idir.y >= 0.0f) {
-    *near_y = 2;
-    *far_y = 3;
-  }
-  else {
-    *near_y = 3;
-    *far_y = 2;
-  }
-  if (idir.z >= 0.0f) {
-    *near_z = 4;
-    *far_z = 5;
-  }
-  else {
-    *near_z = 5;
-    *far_z = 4;
-  }
-#endif
-}
-
-/* TOOD(sergey): Investigate if using intrinsics helps for both
- * stack item swap and float comparison.
- */
-ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b)
-{
-  QBVHStackItem tmp = *a;
-  *a = *b;
-  *b = tmp;
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
-                                       QBVHStackItem *ccl_restrict s2,
-                                       QBVHStackItem *ccl_restrict s3)
-{
-  if (s2->dist < s1->dist) {
-    qbvh_item_swap(s2, s1);
-  }
-  if (s3->dist < s2->dist) {
-    qbvh_item_swap(s3, s2);
-  }
-  if (s2->dist < s1->dist) {
-    qbvh_item_swap(s2, s1);
-  }
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
-                                       QBVHStackItem *ccl_restrict s2,
-                                       QBVHStackItem *ccl_restrict s3,
-                                       QBVHStackItem *ccl_restrict s4)
-{
-  if (s2->dist < s1->dist) {
-    qbvh_item_swap(s2, s1);
-  }
-  if (s4->dist < s3->dist) {
-    qbvh_item_swap(s4, s3);
-  }
-  if (s3->dist < s1->dist) {
-    qbvh_item_swap(s3, s1);
-  }
-  if (s4->dist < s2->dist) {
-    qbvh_item_swap(s4, s2);
-  }
-  if (s3->dist < s2->dist) {
-    qbvh_item_swap(s3, s2);
-  }
-}
-
-/* Axis-aligned nodes intersection */
-
-// ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
-static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
-                                       const ssef &isect_near,
-                                       const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
-                                       const sse3f &org_idir,
-#else
-                                       const sse3f &org,
-#endif
-                                       const sse3f &idir,
-                                       const int near_x,
-                                       const int near_y,
-                                       const int near_z,
-                                       const int far_x,
-                                       const int far_y,
-                                       const int far_z,
-                                       const int node_addr,
-                                       ssef *ccl_restrict dist)
-{
-  const int offset = node_addr + 1;
-#ifdef __KERNEL_AVX2__
-  const ssef tnear_x = msub(
-      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, org_idir.x);
-  const ssef tnear_y = msub(
-      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, org_idir.y);
-  const ssef tnear_z = msub(
-      kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, org_idir.z);
-  const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, org_idir.x);
-  const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, org_idir.y);
-  const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, org_idir.z);
-#else
-  const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - org.x) * idir.x;
-  const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - org.y) * idir.y;
-  const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - org.z) * idir.z;
-  const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - org.x) * idir.x;
-  const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - org.y) * idir.y;
-  const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - org.z) * idir.z;
-#endif
-
-#ifdef __KERNEL_SSE41__
-  const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near));
-  const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far));
-  const sseb vmask = cast(tnear) > cast(tfar);
-  int mask = (int)movemask(vmask) ^ 0xf;
-#else
-  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-  const sseb vmask = tnear <= tfar;
-  int mask = (int)movemask(vmask);
-#endif
-  *dist = tnear;
-  return mask;
-}
-
-/* Unaligned nodes intersection */
-
-ccl_device_inline int qbvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg,
-                                                    const ssef &isect_near,
-                                                    const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
-                                                    const sse3f &org_idir,
-#endif
-                                                    const sse3f &org,
-                                                    const sse3f &dir,
-                                                    const sse3f &idir,
-                                                    const int near_x,
-                                                    const int near_y,
-                                                    const int near_z,
-                                                    const int far_x,
-                                                    const int far_y,
-                                                    const int far_z,
-                                                    const int node_addr,
-                                                    ssef *ccl_restrict dist)
-{
-  const int offset = node_addr;
-  const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1);
-  const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2);
-  const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3);
-
-  const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4);
-  const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5);
-  const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6);
-
-  const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7);
-  const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8);
-  const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9);
-
-  const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10);
-  const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11);
-  const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12);
-
-  const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
-             aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
-             aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
-  const ssef aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x,
-             aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y,
-             aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z;
-
-  const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
-  const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
-             nrdir_z = neg_one / aligned_dir_z;
-
-  const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
-             tlower_z = aligned_P_z * nrdir_z;
-
-  const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
-             tupper_z = tlower_z - nrdir_z;
-
-#ifdef __KERNEL_SSE41__
-  const ssef tnear_x = mini(tlower_x, tupper_x);
-  const ssef tnear_y = mini(tlower_y, tupper_y);
-  const ssef tnear_z = mini(tlower_z, tupper_z);
-  const ssef tfar_x = maxi(tlower_x, tupper_x);
-  const ssef tfar_y = maxi(tlower_y, tupper_y);
-  const ssef tfar_z = maxi(tlower_z, tupper_z);
-  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-  const sseb vmask = tnear <= tfar;
-  *dist = tnear;
-  return movemask(vmask);
-#else
-  const ssef tnear_x = min(tlower_x, tupper_x);
-  const ssef tnear_y = min(tlower_y, tupper_y);
-  const ssef tnear_z = min(tlower_z, tupper_z);
-  const ssef tfar_x = max(tlower_x, tupper_x);
-  const ssef tfar_y = max(tlower_y, tupper_y);
-  const ssef tfar_z = max(tlower_z, tupper_z);
-  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-  const sseb vmask = tnear <= tfar;
-  *dist = tnear;
-  return movemask(vmask);
-#endif
-}
-
-/* Intersectors wrappers.
- *
- * They'll check node type and call appropriate intersection code.
- */
-
-ccl_device_inline int qbvh_node_intersect(KernelGlobals *ccl_restrict kg,
-                                          const ssef &isect_near,
-                                          const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
-                                          const sse3f &org_idir,
-#endif
-                                          const sse3f &org,
-                                          const sse3f &dir,
-                                          const sse3f &idir,
-                                          const int near_x,
-                                          const int near_y,
-                                          const int near_z,
-                                          const int far_x,
-                                          const int far_y,
-                                          const int far_z,
-                                          const int node_addr,
-                                          ssef *ccl_restrict dist)
-{
-  const int offset = node_addr;
-  const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
-  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-    return qbvh_unaligned_node_intersect(kg,
-                                         isect_near,
-                                         isect_far,
-#ifdef __KERNEL_AVX2__
-                                         org_idir,
-#endif
-                                         org,
-                                         dir,
-                                         idir,
-                                         near_x,
-                                         near_y,
-                                         near_z,
-                                         far_x,
-                                         far_y,
-                                         far_z,
-                                         node_addr,
-                                         dist);
-  }
-  else {
-    return qbvh_aligned_node_intersect(kg,
-                                       isect_near,
-                                       isect_far,
-#ifdef __KERNEL_AVX2__
-                                       org_idir,
-#else
-                                       org,
-#endif
-                                       idir,
-                                       near_x,
-                                       near_y,
-                                       near_z,
-                                       far_x,
-                                       far_y,
-                                       far_z,
-                                       node_addr,
-                                       dist);
-  }
-}
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
deleted file mode 100644
index 682251bf25b..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ /dev/null
@@ -1,453 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const uint visibility,
-                                             const uint max_hits,
-                                             uint *num_hits)
-{
-  /* TODO(sergey):
-   *  - Test if pushing distance on the stack helps.
-   * - Likely and unlikely for if() statements.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  const float tmax = ray->t;
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  *num_hits = 0;
-  isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
-  int num_hits_in_instance = 0;
-#endif
-
-  ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-  sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-        (void)inodes;
-
-        if (false
-#ifdef __VISIBILITY_FLAG__
-            || ((__float_as_uint(inodes.x) & visibility) == 0)
-#endif
-#if BVH_FEATURE(BVH_MOTION)
-            || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-        ) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        ssef dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            qbvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-          qbvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3]);
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(leaf.z) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          while (prim_addr < prim_addr2) {
-            kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-            bool hit;
-
-            /* todo: specialized intersect functions which don't fill in
-             * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
-             * might give a few % performance improvement */
-
-            switch (p_type) {
-              case PRIMITIVE_TRIANGLE: {
-                hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
-                break;
-              }
-#if BVH_FEATURE(BVH_MOTION)
-              case PRIMITIVE_MOTION_TRIANGLE: {
-                hit = motion_triangle_intersect(
-                    kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
-                break;
-              }
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-              case PRIMITIVE_CURVE:
-              case PRIMITIVE_MOTION_CURVE: {
-                const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-                if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                  hit = cardinal_curve_intersect(kg,
-                                                 isect_array,
-                                                 P,
-                                                 dir,
-                                                 visibility,
-                                                 object,
-                                                 prim_addr,
-                                                 ray->time,
-                                                 curve_type);
-                }
-                else {
-                  hit = curve_intersect(kg,
-                                        isect_array,
-                                        P,
-                                        dir,
-                                        visibility,
-                                        object,
-                                        prim_addr,
-                                        ray->time,
-                                        curve_type);
-                }
-                break;
-              }
-#endif
-              default: {
-                hit = false;
-                break;
-              }
-            }
-
-            /* Shadow ray early termination. */
-            if (hit) {
-              /* detect if this surface has a shader with transparent shadows */
-
-              /* todo: optimize so primitive visibility flag indicates if
-               * the primitive has a transparent shadow shader? */
-              int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-              int shader = 0;
-
-#ifdef __HAIR__
-              if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-              {
-                shader = kernel_tex_fetch(__tri_shader, prim);
-              }
-#ifdef __HAIR__
-              else {
-                float4 str = kernel_tex_fetch(__curves, prim);
-                shader = __float_as_int(str.z);
-              }
-#endif
-              int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
-              /* if no transparent shadows, all light is blocked */
-              if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-                return true;
-              }
-              /* if maximum number of hits reached, block all light */
-              else if (*num_hits == max_hits) {
-                return true;
-              }
-
-              /* move on to next entry in intersections array */
-              isect_array++;
-              (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
-              num_hits_in_instance++;
-#endif
-
-              isect_array->t = isect_t;
-            }
-
-            prim_addr++;
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-          isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
-          isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
-
-          num_hits_in_instance = 0;
-          isect_array->t = isect_t;
-
-          qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-          tfar = ssef(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-          dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-          idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-          P_idir = P * idir;
-          P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-          org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-          node_addr = kernel_tex_fetch(__object_node, object);
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
-      }
-
-      isect_t = tmax;
-      isect_array->t = isect_t;
-
-      qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = ssef(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-      idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return false;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
deleted file mode 100644
index f43e84bf368..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility)
-{
-  /* TODO(sergey):
-   * - Test if pushing distance on the stack helps (for non shadow rays).
-   * - Separate version for shadow rays.
-   * - Likely and unlikely for if() statements.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-  traversal_stack[0].dist = -FLT_MAX;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-  float node_dist = -FLT_MAX;
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
-  isect->u = 0.0f;
-  isect->v = 0.0f;
-  isect->prim = PRIM_NONE;
-  isect->object = OBJECT_NONE;
-
-  BVH_DEBUG_INIT();
-
-  ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-  sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-  sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-        (void)inodes;
-
-        if (UNLIKELY(node_dist > isect->t)
-#if BVH_FEATURE(BVH_MOTION)
-            || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-#ifdef __VISIBILITY_FLAG__
-            || (__float_as_uint(inodes.x) & visibility) == 0
-#endif
-        ) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-          continue;
-        }
-
-        int child_mask;
-        ssef dist;
-
-        BVH_DEBUG_NEXT_NODE();
-
-        {
-          child_mask = NODE_INTERSECT(kg,
-                                      tnear,
-                                      tfar,
-#ifdef __KERNEL_AVX2__
-                                      P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                      org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                      dir4,
-#endif
-                                      idir4,
-                                      near_x,
-                                      near_y,
-                                      near_z,
-                                      far_x,
-                                      far_y,
-                                      far_z,
-                                      node_addr,
-                                      &dist);
-        }
-
-        if (child_mask != 0) {
-          float4 cnodes;
-          /* TODO(sergey): Investigate whether moving cnodes upwards
-           * gives a speedup (will be different cache pattern but will
-           * avoid extra check here).
-           */
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          float d0 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            node_dist = d0;
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              node_dist = d1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              node_dist = d0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            qbvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            node_dist = traversal_stack[stack_ptr].dist;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-          qbvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3]);
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        node_dist = traversal_stack[stack_ptr].dist;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-#ifdef __VISIBILITY_FLAG__
-        if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
-#else
-        if (UNLIKELY((node_dist > isect->t)))
-#endif
-        {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-          continue;
-        }
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          node_dist = traversal_stack[stack_ptr].dist;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (type & PRIMITIVE_ALL) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                BVH_DEBUG_NEXT_INTERSECTION();
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
-                  tfar = ssef(isect->t);
-                  /* Shadow ray early termination. */
-                  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                BVH_DEBUG_NEXT_INTERSECTION();
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                if (motion_triangle_intersect(
-                        kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
-                  tfar = ssef(isect->t);
-                  /* Shadow ray early termination. */
-                  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }
-              break;
-            }
-#endif /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
-            case PRIMITIVE_CURVE:
-            case PRIMITIVE_MOTION_CURVE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                BVH_DEBUG_NEXT_INTERSECTION();
-                const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-                kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
-                bool hit;
-                if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-                  hit = cardinal_curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
-                else {
-                  hit = curve_intersect(
-                      kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
-                }
-                if (hit) {
-                  tfar = ssef(isect->t);
-                  /* Shadow ray early termination. */
-                  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-                    return true;
-                  }
-                }
-              }
-              break;
-            }
-#endif /* BVH_FEATURE(BVH_HAIR) */
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-          qbvh_instance_motion_push(
-              kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
-#  else
-          qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
-#  endif
-
-          qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-          tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-          dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-          idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-          P_idir = P * idir;
-          P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-          org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-          traversal_stack[stack_ptr].dist = -FLT_MAX;
-
-          node_addr = kernel_tex_fetch(__object_node, object);
-
-          BVH_DEBUG_NEXT_INSTANCE();
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-      qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-      idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      node_dist = traversal_stack[stack_ptr].dist;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
deleted file mode 100644
index e4eaed04467..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility)
-{
-  /* TODO(sergey):
-   * - Test if pushing distance on the stack helps.
-   * - Likely and unlikely for if() statements.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
-  isect->u = 0.0f;
-  isect->v = 0.0f;
-  isect->prim = PRIM_NONE;
-  isect->object = OBJECT_NONE;
-
-  ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-  sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-  sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(inodes.x) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        ssef dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            qbvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-          qbvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3]);
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-        if ((__float_as_uint(leaf.z) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (p_type) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
-              }
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                motion_triangle_intersect(
-                    kg, isect, P, dir, ray->time, visibility, object, prim_addr);
-              }
-              break;
-            }
-#endif
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
-          if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
-            isect->t = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-            isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-            qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-            tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-            dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-            idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-            P_idir = P * idir;
-            P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-            org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-            node_addr = kernel_tex_fetch(__object_node, object);
-          }
-          else {
-            /* Pop. */
-            object = OBJECT_NONE;
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-          }
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#  else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#  endif
-
-      qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-      idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
deleted file mode 100644
index eddc48c487e..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const uint max_hits,
-                                             const uint visibility)
-{
-  /* TODO(sergey):
-   * - Test if pushing distance on the stack helps.
-   * - Likely and unlikely for if() statements.
-   * - Test restrict attribute for pointers.
-   */
-
-  /* Traversal stack in CUDA thread-local memory. */
-  QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
-  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
-  /* Traversal variables in registers. */
-  int stack_ptr = 0;
-  int node_addr = kernel_data.bvh.root;
-
-  /* Ray parameters in registers. */
-  const float tmax = ray->t;
-  float3 P = ray->P;
-  float3 dir = bvh_clamp_direction(ray->D);
-  float3 idir = bvh_inverse_direction(dir);
-  int object = OBJECT_NONE;
-  float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  uint num_hits = 0;
-  isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
-  int num_hits_in_instance = 0;
-#endif
-
-  ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-  sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-  sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-  float3 P_idir = P * idir;
-  sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-  sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-  /* Offsets to select the side that becomes the lower or upper bound. */
-  int near_x, near_y, near_z;
-  int far_x, far_y, far_z;
-  qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
-  /* Traversal loop. */
-  do {
-    do {
-      /* Traverse internal nodes. */
-      while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
-        if ((__float_as_uint(inodes.x) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-#endif
-
-        ssef dist;
-        int child_mask = NODE_INTERSECT(kg,
-                                        tnear,
-                                        tfar,
-#ifdef __KERNEL_AVX2__
-                                        P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-                                        org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-                                        dir4,
-#endif
-                                        idir4,
-                                        near_x,
-                                        near_y,
-                                        near_z,
-                                        far_x,
-                                        far_y,
-                                        far_z,
-                                        node_addr,
-                                        &dist);
-
-        if (child_mask != 0) {
-          float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-          if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
-          }
-          else
-#endif
-          {
-            cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
-          }
-
-          /* One child is hit, continue with that child. */
-          int r = __bscf(child_mask);
-          if (child_mask == 0) {
-            node_addr = __float_as_int(cnodes[r]);
-            continue;
-          }
-
-          /* Two children are hit, push far child, and continue with
-           * closer child.
-           */
-          int c0 = __float_as_int(cnodes[r]);
-          float d0 = ((float *)&dist)[r];
-          r = __bscf(child_mask);
-          int c1 = __float_as_int(cnodes[r]);
-          float d1 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            if (d1 < d0) {
-              node_addr = c1;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c0;
-              traversal_stack[stack_ptr].dist = d0;
-              continue;
-            }
-            else {
-              node_addr = c0;
-              ++stack_ptr;
-              kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-              traversal_stack[stack_ptr].addr = c1;
-              traversal_stack[stack_ptr].dist = d1;
-              continue;
-            }
-          }
-
-          /* Here starts the slow path for 3 or 4 hit children. We push
-           * all nodes onto the stack to sort them there.
-           */
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c1;
-          traversal_stack[stack_ptr].dist = d1;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c0;
-          traversal_stack[stack_ptr].dist = d0;
-
-          /* Three children are hit, push all onto stack and sort 3
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c2 = __float_as_int(cnodes[r]);
-          float d2 = ((float *)&dist)[r];
-          if (child_mask == 0) {
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = c2;
-            traversal_stack[stack_ptr].dist = d2;
-            qbvh_stack_sort(&traversal_stack[stack_ptr],
-                            &traversal_stack[stack_ptr - 1],
-                            &traversal_stack[stack_ptr - 2]);
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-            continue;
-          }
-
-          /* Four children are hit, push all onto stack and sort 4
-           * stack items, continue with closest child.
-           */
-          r = __bscf(child_mask);
-          int c3 = __float_as_int(cnodes[r]);
-          float d3 = ((float *)&dist)[r];
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c3;
-          traversal_stack[stack_ptr].dist = d3;
-          ++stack_ptr;
-          kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-          traversal_stack[stack_ptr].addr = c2;
-          traversal_stack[stack_ptr].dist = d2;
-          qbvh_stack_sort(&traversal_stack[stack_ptr],
-                          &traversal_stack[stack_ptr - 1],
-                          &traversal_stack[stack_ptr - 2],
-                          &traversal_stack[stack_ptr - 3]);
-        }
-
-        node_addr = traversal_stack[stack_ptr].addr;
-        --stack_ptr;
-      }
-
-      /* If node is leaf, fetch triangle list. */
-      if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-        if ((__float_as_uint(leaf.z) & visibility) == 0) {
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-          continue;
-        }
-
-        int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-        if (prim_addr >= 0) {
-#endif
-          int prim_addr2 = __float_as_int(leaf.y);
-          const uint type = __float_as_int(leaf.w);
-          const uint p_type = type & PRIMITIVE_ALL;
-          bool hit;
-
-          /* Pop. */
-          node_addr = traversal_stack[stack_ptr].addr;
-          --stack_ptr;
-
-          /* Primitive intersection. */
-          switch (p_type) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
-                if (hit) {
-                  /* Move on to next entry in intersections array. */
-                  isect_array++;
-                  num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
-                  num_hits_in_instance++;
-#endif
-                  isect_array->t = isect_t;
-                  if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
-                    if (object != OBJECT_NONE) {
-#  if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
-                    return num_hits;
-                  }
-                }
-              }
-              break;
-            }
-#if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-                /* Only primitives from volume object. */
-                uint tri_object = (object == OBJECT_NONE) ?
-                                      kernel_tex_fetch(__prim_object, prim_addr) :
-                                      object;
-                int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-                  continue;
-                }
-                /* Intersect ray against primitive. */
-                hit = motion_triangle_intersect(
-                    kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
-                if (hit) {
-                  /* Move on to next entry in intersections array. */
-                  isect_array++;
-                  num_hits++;
-#  if BVH_FEATURE(BVH_INSTANCING)
-                  num_hits_in_instance++;
-#  endif
-                  isect_array->t = isect_t;
-                  if (num_hits == max_hits) {
-#  if BVH_FEATURE(BVH_INSTANCING)
-                    if (object != OBJECT_NONE) {
-#    if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#    else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#    endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
-#  endif /* BVH_FEATURE(BVH_INSTANCING) */
-                    return num_hits;
-                  }
-                }
-              }
-              break;
-            }
-#endif
-          }
-        }
-#if BVH_FEATURE(BVH_INSTANCING)
-        else {
-          /* Instance push. */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
-          if (object_flag & SD_OBJECT_HAS_VOLUME) {
-#  if BVH_FEATURE(BVH_MOTION)
-            isect_t = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#  else
-            isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-#  endif
-
-            qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-            tfar = ssef(isect_t);
-            idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  if BVH_FEATURE(BVH_HAIR)
-            dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-#  ifdef __KERNEL_AVX2__
-            P_idir = P * idir;
-            P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-            org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-            num_hits_in_instance = 0;
-            isect_array->t = isect_t;
-
-            ++stack_ptr;
-            kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
-            traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
-            node_addr = kernel_tex_fetch(__object_node, object);
-          }
-          else {
-            /* Pop. */
-            object = OBJECT_NONE;
-            node_addr = traversal_stack[stack_ptr].addr;
-            --stack_ptr;
-          }
-        }
-      }
-#endif /* FEATURE(BVH_INSTANCING) */
-    } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-    if (stack_ptr >= 0) {
-      kernel_assert(object != OBJECT_NONE);
-
-      /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#  if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#  else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#  endif
-      }
-
-      isect_t = tmax;
-      isect_array->t = isect_t;
-
-      qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-      tfar = ssef(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-      dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-      idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-      P_idir = P * idir;
-      P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-      org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-      object = OBJECT_NONE;
-      node_addr = traversal_stack[stack_ptr].addr;
-      --stack_ptr;
-    }
-#endif /* FEATURE(BVH_INSTANCING) */
-  } while (node_addr != ENTRYPOINT_SENTINEL);
-
-  return num_hits;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 4cc61e8ee71..6070fd983f5 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -119,13 +119,16 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
                                   differential3 *domega_in,
                                   float *pdf)
 {
+  /* For curves use the smooth normal; for ribbons in particular, the geometric
+   * normal otherwise gives too much darkening. */
   int label;
+  const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sc->N : sd->Ng;
 
   switch (sc->type) {
     case CLOSURE_BSDF_DIFFUSE_ID:
     case CLOSURE_BSDF_BSSRDF_ID:
       label = bsdf_diffuse_sample(sc,
-                                  sd->Ng,
+                                  Ng,
                                   sd->I,
                                   sd->dI.dx,
                                   sd->dI.dy,
@@ -140,7 +143,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #ifdef __SVM__
     case CLOSURE_BSDF_OREN_NAYAR_ID:
       label = bsdf_oren_nayar_sample(sc,
-                                     sd->Ng,
+                                     Ng,
                                      sd->I,
                                      sd->dI.dx,
                                      sd->dI.dy,
@@ -155,7 +158,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #  ifdef __OSL__
     case CLOSURE_BSDF_PHONG_RAMP_ID:
       label = bsdf_phong_ramp_sample(sc,
-                                     sd->Ng,
+                                     Ng,
                                      sd->I,
                                      sd->dI.dx,
                                      sd->dI.dy,
@@ -169,7 +172,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
       label = bsdf_diffuse_ramp_sample(sc,
-                                       sd->Ng,
+                                       Ng,
                                        sd->I,
                                        sd->dI.dx,
                                        sd->dI.dy,
@@ -184,7 +187,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #  endif
     case CLOSURE_BSDF_TRANSLUCENT_ID:
       label = bsdf_translucent_sample(sc,
-                                      sd->Ng,
+                                      Ng,
                                       sd->I,
                                       sd->dI.dx,
                                       sd->dI.dy,
@@ -198,7 +201,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_REFLECTION_ID:
       label = bsdf_reflection_sample(sc,
-                                     sd->Ng,
+                                     Ng,
                                      sd->I,
                                      sd->dI.dx,
                                      sd->dI.dy,
@@ -212,7 +215,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_REFRACTION_ID:
       label = bsdf_refraction_sample(sc,
-                                     sd->Ng,
+                                     Ng,
                                      sd->I,
                                      sd->dI.dx,
                                      sd->dI.dy,
@@ -226,7 +229,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_TRANSPARENT_ID:
       label = bsdf_transparent_sample(sc,
-                                      sd->Ng,
+                                      Ng,
                                       sd->I,
                                       sd->dI.dx,
                                       sd->dI.dy,
@@ -244,7 +247,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
     case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
       label = bsdf_microfacet_ggx_sample(kg,
                                          sc,
-                                         sd->Ng,
+                                         Ng,
                                          sd->I,
                                          sd->dI.dx,
                                          sd->dI.dy,
@@ -260,7 +263,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
       label = bsdf_microfacet_multi_ggx_sample(kg,
                                                sc,
-                                               sd->Ng,
+                                               Ng,
                                                sd->I,
                                                sd->dI.dx,
                                                sd->dI.dy,
@@ -277,7 +280,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
       label = bsdf_microfacet_multi_ggx_glass_sample(kg,
                                                      sc,
-                                                     sd->Ng,
+                                                     Ng,
                                                      sd->I,
                                                      sd->dI.dx,
                                                      sd->dI.dy,
@@ -294,7 +297,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
     case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
       label = bsdf_microfacet_beckmann_sample(kg,
                                               sc,
-                                              sd->Ng,
+                                              Ng,
                                               sd->I,
                                               sd->dI.dx,
                                               sd->dI.dy,
@@ -308,7 +311,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
       label = bsdf_ashikhmin_shirley_sample(sc,
-                                            sd->Ng,
+                                            Ng,
                                             sd->I,
                                             sd->dI.dx,
                                             sd->dI.dy,
@@ -322,7 +325,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
       label = bsdf_ashikhmin_velvet_sample(sc,
-                                           sd->Ng,
+                                           Ng,
                                            sd->I,
                                            sd->dI.dx,
                                            sd->dI.dy,
@@ -336,7 +339,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_DIFFUSE_TOON_ID:
       label = bsdf_diffuse_toon_sample(sc,
-                                       sd->Ng,
+                                       Ng,
                                        sd->I,
                                        sd->dI.dx,
                                        sd->dI.dy,
@@ -350,7 +353,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_GLOSSY_TOON_ID:
       label = bsdf_glossy_toon_sample(sc,
-                                      sd->Ng,
+                                      Ng,
                                       sd->I,
                                       sd->dI.dx,
                                       sd->dI.dy,
@@ -364,7 +367,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_HAIR_REFLECTION_ID:
       label = bsdf_hair_reflection_sample(sc,
-                                          sd->Ng,
+                                          Ng,
                                           sd->I,
                                           sd->dI.dx,
                                           sd->dI.dy,
@@ -378,7 +381,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
       label = bsdf_hair_transmission_sample(sc,
-                                            sd->Ng,
+                                            Ng,
                                             sd->I,
                                             sd->dI.dx,
                                             sd->dI.dy,
@@ -398,7 +401,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
     case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
     case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
       label = bsdf_principled_diffuse_sample(sc,
-                                             sd->Ng,
+                                             Ng,
                                              sd->I,
                                              sd->dI.dx,
                                              sd->dI.dy,
@@ -412,7 +415,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
       break;
     case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
       label = bsdf_principled_sheen_sample(sc,
-                                           sd->Ng,
+                                           Ng,
                                            sd->I,
                                            sd->dI.dx,
                                            sd->dI.dy,
@@ -485,9 +488,12 @@ ccl_device_inline
               const float3 omega_in,
               float *pdf)
 {
+  /* For curves use the smooth normal; for ribbons in particular, the geometric
+   * normal otherwise gives too much darkening. */
+  const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sd->N : sd->Ng;
   float3 eval;
 
-  if (dot(sd->Ng, omega_in) >= 0.0f) {
+  if (dot(Ng, omega_in) >= 0.0f) {
     switch (sc->type) {
       case CLOSURE_BSDF_DIFFUSE_ID:
       case CLOSURE_BSDF_BSSRDF_ID:
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index f78bbeb5d9d..389bd62ba68 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -206,9 +206,6 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
   float3 X = safe_normalize(sd->dPdu);
   float3 Y = safe_normalize(cross(X, sd->I));
   float3 Z = safe_normalize(cross(X, Y));
-  /* TODO: the solution below works where sd->Ng is the normal
-   * pointing from the center of the curve to the shading point.
-   * It doesn't work for triangles, see https://developer.blender.org/T43625 */
 
   /* h -1..0..1 means the rays goes from grazing the hair, to hitting it at
    * the center, to grazing the other edge. This is the sine of the angle
@@ -216,7 +213,9 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
 
   /* TODO: we convert this value to a cosine later and discard the sign, so
    * we could probably save some operations. */
-  float h = dot(cross(sd->Ng, X), Z);
+  float h = (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) ?
+                -sd->v :
+                dot(cross(sd->Ng, X), Z);
 
   kernel_assert(fabsf(h) < 1.0f + 1e-4f);
   kernel_assert(isfinite3_safe(Y));
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 928cad58452..6ff0c7f2044 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -23,33 +23,6 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __HAIR__
 
-/* Interpolation of curve geometry */
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-  float fc = 0.71f;
-  float data[4];
-  float t2 = t * t;
-  data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc;
-  data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t;
-  data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc;
-  data[3] = 3.0f * fc * t2 - 2.0f * fc * t;
-  return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-  float data[4];
-  float fc = 0.71f;
-  float t2 = t * t;
-  float t3 = t2 * t;
-  data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
-  data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
-  data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
-  data[3] = fc * t3 - fc * t2;
-  return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
 /* Reading attributes on various curve elements */
 
 ccl_device float curve_attribute_float(
@@ -225,6 +198,66 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
   }
 }
 
+ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+                                         const ShaderData *sd,
+                                         const AttributeDescriptor desc,
+                                         float4 *dx,
+                                         float4 *dy)
+{
+  if (desc.element == ATTR_ELEMENT_CURVE) {
+    /* Idea: we can't derive any useful differentials here, but for tiled
+     * mipmap image caching it would be useful to avoid always reading the
+     * highest detail level. Maybe a derivative based on the hair density
+     * could be computed somehow? */
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#  endif
+
+    return kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim);
+  }
+  else if (desc.element == ATTR_ELEMENT_CURVE_KEY ||
+           desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
+    float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+    int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+    int k1 = k0 + 1;
+
+    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = sd->du.dx * (f1 - f0);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#  endif
+
+    return (1.0f - sd->u) * f0 + sd->u * f1;
+  }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#  endif
+
+    return kernel_tex_fetch(__attributes_float3, desc.offset);
+  }
+  else {
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#  endif
+
+    return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  }
+}
+
 /* Curve thickness */
 
 ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
@@ -238,12 +271,12 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
     float4 P_curve[2];
 
-    if (sd->type & PRIMITIVE_CURVE) {
+    if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
       P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
       P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
     }
     else {
-      motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+      motion_curve_keys_linear(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
     }
 
     r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 7a770470150..c04dbee52cc 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -1,4 +1,7 @@
 /*
+ * Copyright 2009-2020 Intel Corporation. Adapted from Embree with
+ * modifications.
+ *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
@@ -14,501 +17,620 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Curve primitive intersection functions. */
+/* Curve primitive intersection functions.
+ *
+ * The code here was adapted from curve_intersector_sweep.h in Embree, to get
+ * an exact match between Embree CPU ray-tracing and our GPU ray-tracing. */
+
+#define CURVE_NUM_BEZIER_SUBDIVISIONS 3
+#define CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE (CURVE_NUM_BEZIER_SUBDIVISIONS + 1)
+#define CURVE_NUM_BEZIER_STEPS 2
+#define CURVE_NUM_JACOBIAN_ITERATIONS 5
 
 #ifdef __HAIR__
 
-#  ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+/* Catmull-rom curve evaluation. */
+
+ccl_device_inline float4 catmull_rom_basis_eval(const float4 curve[4], float u)
 {
-  return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
+  const float t = u; /* At u = 0 the result is curve[1], at u = 1 it is curve[2]. */
+  const float s = 1.0f - u;
+  const float n0 = -t * s * s; /* n0..n3 weight curve[0..3]; the common 0.5 is applied below. */
+  const float n1 = 2.0f + t * t * (3.0f * t - 5.0f);
+  const float n2 = 2.0f + s * s * (3.0f * s - 5.0f);
+  const float n3 = -s * t * t;
+  return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
 }
-#  endif
 
-/* On CPU pass P and dir by reference to aligned vector. */
-ccl_device_forceinline bool cardinal_curve_intersect(KernelGlobals *kg,
-                                                     Intersection *isect,
-                                                     const float3 ccl_ref P,
-                                                     const float3 ccl_ref dir,
-                                                     uint visibility,
-                                                     int object,
-                                                     int curveAddr,
-                                                     float time,
-                                                     int type)
+ccl_device_inline float4 catmull_rom_basis_derivative(const float4 curve[4], float u)
 {
-  const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+  const float t = u; /* n0..n3 below are d/du of the weights in catmull_rom_basis_eval. */
+  const float s = 1.0f - u;
+  const float n0 = -s * s + 2.0f * s * t;
+  const float n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t;
+  const float n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s;
+  const float n3 = -2.0f * s * t + t * t;
+  return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
+}
 
-#  ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
-  if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-    const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-    if (time < prim_time.x || time > prim_time.y) {
-      return false;
-    }
-  }
-#  endif
+ccl_device_inline float4 catmull_rom_basis_derivative2(const float4 curve[4], float u)
+{
 
-  int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-  float epsilon = 0.0f;
-  float r_st, r_en;
+  const float t = u; /* Second d/du of the basis weights; the 0.5 scale is folded into n0..n3. */
+  const float n0 = -3.0f * t + 2.0f;
+  const float n1 = 9.0f * t - 5.0f;
+  const float n2 = -9.0f * t + 4.0f;
+  const float n3 = 3.0f * t - 1.0f;
+  return (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
+}
 
-  int depth = kernel_data.curve.subdivisions;
-  int flags = kernel_data.curve.curveflags;
-  int prim = kernel_tex_fetch(__prim_index, curveAddr);
+/* Thick Curve */
 
-#  ifdef __KERNEL_SSE2__
-  ssef vdir = load4f(dir);
-  ssef vcurve_coef[4];
-  const float3 *curve_coef = (float3 *)vcurve_coef;
+ccl_device_inline float3 dnormalize(const float3 p, const float3 dp)
+{
+  const float pp = dot(p, p); /* Squared length of p. */
+  const float pdp = dot(p, dp);
+  return (pp * dp - pdp * p) / (pp * sqrtf(pp)); /* Derivative of p / |p| when p changes by dp. */
+}
 
-  {
-    ssef dtmp = vdir * vdir;
-    ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
-    ssef rd_ss = load1f_first(1.0f) / d_ss;
-
-    ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
-    int2 &v00 = (int2 &)v00vec;
-
-    int k0 = v00.x + segment;
-    int k1 = k0 + 1;
-    int ka = max(k0 - 1, v00.x);
-    int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-#    if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \
-        (!defined(_MSC_VER) || _MSC_VER > 1800)
-    avxf P_curve_0_1, P_curve_2_3;
-    if (is_curve_primitive) {
-      P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
-      P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
-    }
-    else {
-      int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-      motion_cardinal_curve_keys_avx(
-          kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1, &P_curve_2_3);
-    }
-#    else  /* __KERNEL_AVX2__ */
-    ssef P_curve[4];
-
-    if (is_curve_primitive) {
-      P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
-      P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
-      P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
-      P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+ccl_device_inline float sqr_point_to_line_distance(const float3 PmQ0, const float3 Q1mQ0)
+{
+  const float3 N = cross(PmQ0, Q1mQ0); /* |N| = |PmQ0| * |Q1mQ0| * sin(angle between them). */
+  const float3 D = Q1mQ0;
+  return dot(N, N) / dot(D, D); /* Squared distance of PmQ0 from the line along Q1mQ0. */
+}
+
+ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
+                                          const float3 cylinder_end,
+                                          const float cylinder_radius,
+                                          const float3 ray_dir,
+                                          float2 *t_o,
+                                          float *u0_o,
+                                          float3 *Ng0_o,
+                                          float *u1_o,
+                                          float3 *Ng1_o)
+{
+  /* Quadratic A*t^2 + B*t + C = 0 for the ray t * ray_dir against the infinite cylinder. */
+  const float rl = 1.0f / len(cylinder_end - cylinder_start);
+  const float3 P0 = cylinder_start, dP = (cylinder_end - cylinder_start) * rl;
+  const float3 O = -P0, dO = ray_dir;
+
+  const float dOdO = dot(dO, dO);
+  const float OdO = dot(dO, O);
+  const float OO = dot(O, O);
+  const float dOz = dot(dP, dO);
+  const float Oz = dot(dP, O);
+
+  const float A = dOdO - sqr(dOz);
+  const float B = 2.0f * (OdO - dOz * Oz);
+  const float C = OO - sqr(Oz) - sqr(cylinder_radius);
+
+  /* We miss the cylinder if the discriminant is smaller than zero (or NaN). */
+  const float D = B * B - 4.0f * A * C;
+  if (!(D >= 0.0f)) {
+    *t_o = make_float2(FLT_MAX, -FLT_MAX); /* Empty interval; u and Ng outputs are left unset. */
+    return false;
+  }
+
+  /* Special case for rays that are parallel to the cylinder. */
+  const float eps = 16.0f * FLT_EPSILON * max(fabsf(dOdO), fabsf(sqr(dOz)));
+  if (fabsf(A) < eps) {
+    if (C <= 0.0f) {
+      *t_o = make_float2(-FLT_MAX, FLT_MAX); /* Ray stays inside: whole t range. */
+      return true;
     }
     else {
-      int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-      motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4 *)&P_curve);
+      *t_o = make_float2(FLT_MAX, -FLT_MAX); /* Ray stays outside: empty interval, as on a miss. */
+      return false;
     }
-#    endif /* __KERNEL_AVX2__ */
-
-    ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
-    ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
-    ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
-    ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-    ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
-    ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-    ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
-    ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-#    if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \
-        (!defined(_MSC_VER) || _MSC_VER > 1800)
-    const avxf vPP = _mm256_broadcast_ps(&P.m128);
-    const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
-    const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
-    const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
-    const avxf p01 = madd(
-        shuffle<0>(P_curve_0_1 - vPP),
-        htfm00,
-        madd(shuffle<1>(P_curve_0_1 - vPP), htfm11, shuffle<2>(P_curve_0_1 - vPP) * htfm22));
-    const avxf p23 = madd(
-        shuffle<0>(P_curve_2_3 - vPP),
-        htfm00,
-        madd(shuffle<1>(P_curve_2_3 - vPP), htfm11, shuffle<2>(P_curve_2_3 - vPP) * htfm22));
-
-    const ssef p0 = _mm256_castps256_ps128(p01);
-    const ssef p1 = _mm256_extractf128_ps(p01, 1);
-    const ssef p2 = _mm256_castps256_ps128(p23);
-    const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
-    const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
-    r_st = ((float4 &)P_curve_1).w;
-    const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
-    r_en = ((float4 &)P_curve_2).w;
-#    else  /* __KERNEL_AVX2__ */
-    ssef htfm[] = {htfm0, htfm1, htfm2};
-    ssef vP = load4f(P);
-    ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
-    ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
-    ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
-    ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
-    r_st = ((float4 &)P_curve[1]).w;
-    r_en = ((float4 &)P_curve[2]).w;
-#    endif /* __KERNEL_AVX2__ */
-
-    float fc = 0.71f;
-    ssef vfc = ssef(fc);
-    ssef vfcxp3 = vfc * p3;
-
-    vcurve_coef[0] = p1;
-    vcurve_coef[1] = vfc * (p2 - p0);
-    vcurve_coef[2] = madd(
-        ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
-    vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
   }
-#  else
-  float3 curve_coef[4];
 
-  /* curve Intersection check */
-  /* obtain curve parameters */
+  /* Standard case for rays that are not parallel to the cylinder. */
+  const float Q = sqrtf(D);
+  const float rcp_2A = 1.0f / (2.0f * A);
+  const float t0 = (-B - Q) * rcp_2A;
+  const float t1 = (-B + Q) * rcp_2A;
+
+  /* Calculates u and Ng for near hit. */
   {
-    /* ray transform created - this should be created at beginning of intersection loop */
-    Transform htfm;
-    float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-    htfm = make_transform(dir.z / d,
-                          0,
-                          -dir.x / d,
-                          0,
-                          -dir.x * dir.y / d,
-                          d,
-                          -dir.y * dir.z / d,
-                          0,
-                          dir.x,
-                          dir.y,
-                          dir.z,
-                          0);
-
-    float4 v00 = kernel_tex_fetch(__curves, prim);
-
-    int k0 = __float_as_int(v00.x) + segment;
-    int k1 = k0 + 1;
-
-    int ka = max(k0 - 1, __float_as_int(v00.x));
-    int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-    float4 P_curve[4];
-
-    if (is_curve_primitive) {
-      P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-      P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-      P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-      P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-    }
-    else {
-      int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-      motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
-    }
+    *u0_o = (t0 * dOz + Oz) * rl; /* Position along the axis: 0 at start, 1 at end. */
+    const float3 Pr = t0 * ray_dir;
+    const float3 Pl = (*u0_o) * (cylinder_end - cylinder_start) + cylinder_start;
+    *Ng0_o = Pr - Pl;
+  }
 
-    float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
-    float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
-    float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
-    float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
-    float fc = 0.71f;
-    curve_coef[0] = p1;
-    curve_coef[1] = -fc * p0 + fc * p2;
-    curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-    curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-    r_st = P_curve[1].w;
-    r_en = P_curve[2].w;
+  /* Calculates u and Ng for far hit. */
+  {
+    *u1_o = (t1 * dOz + Oz) * rl;
+    const float3 Pr = t1 * ray_dir;
+    const float3 Pl = (*u1_o) * (cylinder_end - cylinder_start) + cylinder_start;
+    *Ng1_o = Pr - Pl;
   }
-#  endif
 
-  float r_curr = max(r_st, r_en);
-
-  if ((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-    epsilon = 2 * r_curr;
-
-  /* find bounds - this is slow for cubic curves */
-  float upper, lower;
-
-  float zextrem[4];
-  curvebounds(&lower,
-              &upper,
-              &zextrem[0],
-              &zextrem[1],
-              &zextrem[2],
-              &zextrem[3],
-              curve_coef[0].z,
-              curve_coef[1].z,
-              curve_coef[2].z,
-              curve_coef[3].z);
-  if (lower - r_curr > isect->t || upper + r_curr < epsilon)
-    return false;
+  *t_o = make_float2(t0, t1);
 
-  /* minimum width extension */
-  float xextrem[4];
-  curvebounds(&lower,
-              &upper,
-              &xextrem[0],
-              &xextrem[1],
-              &xextrem[2],
-              &xextrem[3],
-              curve_coef[0].x,
-              curve_coef[1].x,
-              curve_coef[2].x,
-              curve_coef[3].x);
-  if (lower > r_curr || upper < -r_curr)
-    return false;
+  return true;
+}
 
-  float yextrem[4];
-  curvebounds(&lower,
-              &upper,
-              &yextrem[0],
-              &yextrem[1],
-              &yextrem[2],
-              &yextrem[3],
-              curve_coef[0].y,
-              curve_coef[1].y,
-              curve_coef[2].y,
-              curve_coef[3].y);
-  if (lower > r_curr || upper < -r_curr)
-    return false;
+ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_dir)
+{
+  const float3 O = -P; /* The ray starts at the coordinate origin; O is that point relative to P. */
+  const float3 D = ray_dir;
+  const float ON = dot(O, N);
+  const float DN = dot(D, N);
+  const float min_rcp_input = 1e-18f;
+  const bool eps = fabsf(DN) < min_rcp_input; /* Ray is treated as parallel to the plane. */
+  const float t = -ON / DN; /* Where t * D crosses the plane through P with normal N. */
+  const float lower = (eps || DN < 0.0f) ? -FLT_MAX : t;
+  const float upper = (eps || DN > 0.0f) ? FLT_MAX : t;
+  return make_float2(lower, upper); /* t range where dot(t * D - P, N) >= 0; unbounded if eps. */
+}
 
-  /* setup recurrent loop */
-  int level = 1 << depth;
-  int tree = 0;
-  float resol = 1.0f / (float)level;
-  bool hit = false;
-
-  /* begin loop */
-  while (!(tree >> (depth))) {
-    const float i_st = tree * resol;
-    const float i_en = i_st + (level * resol);
-
-#  ifdef __KERNEL_SSE2__
-    ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
-    ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]),
-                      vi_st,
-                      vcurve_coef[0]);
-    ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]),
-                      vi_en,
-                      vcurve_coef[0]);
-
-    ssef vbmin = min(vp_st, vp_en);
-    ssef vbmax = max(vp_st, vp_en);
-
-    float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-    float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-    float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-    float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#  else
-    float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st +
-                  curve_coef[0];
-    float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en +
-                  curve_coef[0];
-
-    float bminx = min(p_st.x, p_en.x);
-    float bmaxx = max(p_st.x, p_en.x);
-    float bminy = min(p_st.y, p_en.y);
-    float bmaxy = max(p_st.y, p_en.y);
-    float bminz = min(p_st.z, p_en.z);
-    float bmaxz = max(p_st.z, p_en.z);
-#  endif
+ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+                                          const float dt, /* Added to t before reporting. */
+                                          const float4 curve[4],
+                                          float u, /* Initial guess, refined below. */
+                                          float t, /* Initial guess, refined below. */
+                                          const bool use_backfacing,
+                                          Intersection *isect)
+{
+  const float length_ray_dir = len(ray_dir);
+
+  /* Curve evaluation error is proportional to the largest absolute control point coordinate. */
+  const float4 box_min = min(min(curve[0], curve[1]), min(curve[2], curve[3]));
+  const float4 box_max = max(max(curve[0], curve[1]), max(curve[2], curve[3]));
+  const float4 box_abs = max(fabs(box_min), fabs(box_max));
+  const float P_err = 16.0f * FLT_EPSILON *
+                      max(box_abs.x, max(box_abs.y, max(box_abs.z, box_abs.w)));
+  const float radius_max = box_max.w;
+
+  for (int i = 0; i < CURVE_NUM_JACOBIAN_ITERATIONS; i++) {
+    const float3 Q = ray_dir * t;
+    const float3 dQdt = ray_dir;
+    const float Q_err = 16.0f * FLT_EPSILON * length_ray_dir * t;
+
+    const float4 P4 = catmull_rom_basis_eval(curve, u);
+    const float4 dPdu4 = catmull_rom_basis_derivative(curve, u);
+
+    const float3 P = float4_to_float3(P4);
+    const float3 dPdu = float4_to_float3(dPdu4);
+    const float radius = P4.w;
+    const float dradiusdu = dPdu4.w;
+
+    const float3 ddPdu = float4_to_float3(catmull_rom_basis_derivative2(curve, u));
+
+    const float3 R = Q - P;
+    const float len_R = len(R);
+    const float R_err = max(Q_err, P_err);
+    const float3 dRdu = -dPdu;
+    const float3 dRdt = dQdt;
+
+    const float3 T = normalize(dPdu);
+    const float3 dTdu = dnormalize(dPdu, ddPdu);
+    const float cos_err = P_err / len(dPdu);
+
+    const float f = dot(R, T); /* Component of R along the curve tangent. */
+    const float f_err = len_R * P_err + R_err + cos_err * (1.0f + len_R);
+    const float dfdu = dot(dRdu, T) + dot(R, dTdu);
+    const float dfdt = dot(dRdt, T);
+
+    const float K = dot(R, R) - sqr(f); /* Squared length of R perpendicular to T. */
+    const float dKdu = (dot(R, dRdu) - f * dfdu);
+    const float dKdt = (dot(R, dRdt) - f * dfdt);
+    const float rsqrt_K = inversesqrtf(K);
+
+    const float g = sqrtf(K) - radius; /* Distance from the tangent line minus radius. */
+    const float g_err = R_err + f_err + 16.0f * FLT_EPSILON * radius_max;
+    const float dgdu = dKdu * rsqrt_K - dradiusdu;
+    const float dgdt = dKdt * rsqrt_K;
+
+    const float invdet = 1.0f / (dfdu * dgdt - dgdu * dfdt); /* Newton step for f = g = 0. */
+    u -= (dgdt * f - dfdt * g) * invdet;
+    t -= (-dgdu * f + dfdu * g) * invdet;
+
+    if (fabsf(f) < f_err && fabsf(g) < g_err) {
+      t += dt; /* Back to the parameterization that isect->t uses. */
+      if (!(0.0f <= t && t <= isect->t)) {
+        return false; /* Rejects NaNs */
+      }
+      if (!(u >= 0.0f && u <= 1.0f)) {
+        return false; /* Rejects NaNs */
+      }
 
-    if (xextrem[0] >= i_st && xextrem[0] <= i_en) {
-      bminx = min(bminx, xextrem[1]);
-      bmaxx = max(bmaxx, xextrem[1]);
-    }
-    if (xextrem[2] >= i_st && xextrem[2] <= i_en) {
-      bminx = min(bminx, xextrem[3]);
-      bmaxx = max(bmaxx, xextrem[3]);
-    }
-    if (yextrem[0] >= i_st && yextrem[0] <= i_en) {
-      bminy = min(bminy, yextrem[1]);
-      bmaxy = max(bmaxy, yextrem[1]);
-    }
-    if (yextrem[2] >= i_st && yextrem[2] <= i_en) {
-      bminy = min(bminy, yextrem[3]);
-      bmaxy = max(bmaxy, yextrem[3]);
-    }
-    if (zextrem[0] >= i_st && zextrem[0] <= i_en) {
-      bminz = min(bminz, zextrem[1]);
-      bmaxz = max(bmaxz, zextrem[1]);
-    }
-    if (zextrem[2] >= i_st && zextrem[2] <= i_en) {
-      bminz = min(bminz, zextrem[3]);
-      bmaxz = max(bmaxz, zextrem[3]);
-    }
+      /* Backface culling, skipped when use_backfacing is set. */
+      const float3 R = normalize(Q - P);
+      const float3 U = dradiusdu * R + dPdu;
+      const float3 V = cross(dPdu, R);
+      const float3 Ng = cross(V, U);
+      if (!use_backfacing && dot(ray_dir, Ng) > 0.0f) {
+        return false;
+      }
 
-    float r1 = r_st + (r_en - r_st) * i_st;
-    float r2 = r_st + (r_en - r_st) * i_en;
-    r_curr = max(r1, r2);
+      /* Record intersection (only t, u and v are written here). */
+      isect->t = t;
+      isect->u = u;
+      isect->v = 0.0f;
 
-    if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_curr ||
-        bmaxx < -r_curr || bminy > r_curr || bmaxy < -r_curr) {
-      /* the bounding box does not overlap the square centered at O */
-      tree += level;
-      level = tree & -tree;
+      return true;
     }
-    else if (level == 1) {
-
-      /* the maximum recursion depth is reached.
-       * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-       * dP* is reversed if necessary.*/
-      float t = isect->t;
-      float u = 0.0f;
-      float gd = 0.0f;
-
-      if (flags & CURVE_KN_RIBBONS) {
-        float3 tg = (p_en - p_st);
-#  ifdef __KERNEL_SSE__
-        const float3 tg_sq = tg * tg;
-        float w = tg_sq.x + tg_sq.y;
-#  else
-        float w = tg.x * tg.x + tg.y * tg.y;
-#  endif
-        if (w == 0) {
-          tree++;
-          level = tree & -tree;
-          continue;
-        }
-#  ifdef __KERNEL_SSE__
-        const float3 p_sttg = p_st * tg;
-        w = -(p_sttg.x + p_sttg.y) / w;
+  }
+  return false;
+}
+
+ccl_device bool curve_intersect_recursive(const float3 ray_orig,
+                                          const float3 ray_dir,
+                                          float4 curve[4],
+                                          Intersection *isect)
+{
+  /* Move ray closer to make intersection stable. */
+  const float3 center = float4_to_float3(0.25f * (curve[0] + curve[1] + curve[2] + curve[3]));
+  const float dt = dot(center - ray_orig, ray_dir) / dot(ray_dir, ray_dir);
+  const float3 ref = ray_orig + ray_dir * dt;
+  const float4 ref4 = make_float4(ref.x, ref.y, ref.z, 0.0f);
+  curve[0] -= ref4;
+  curve[1] -= ref4;
+  curve[2] -= ref4;
+  curve[3] -= ref4;
+
+  const bool use_backfacing = false;
+  const float step_size = 1.0f / (float)(CURVE_NUM_BEZIER_STEPS);
+
+  int depth = 0;
+
+  /* todo: optimize stack for GPU somehow? Possibly some bitflags are enough, and
+   * u0/u1 can be derived from the depth. */
+  struct {
+    float u0, u1;
+    int i;
+  } stack[CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE];
+
+  bool found = false;
+
+  float u0 = 0.0f;
+  float u1 = 1.0f;
+  int i = 0;
+
+  while (1) {
+    for (; i < CURVE_NUM_BEZIER_STEPS; i++) {
+      const float step = i * step_size;
+
+      /* Subdivide curve. */
+      const float dscale = (u1 - u0) * (1.0f / 3.0f) * step_size;
+      const float vu0 = mix(u0, u1, step);
+      const float vu1 = mix(u0, u1, step + step_size);
+
+      const float4 P0 = catmull_rom_basis_eval(curve, vu0);
+      const float4 dP0du = dscale * catmull_rom_basis_derivative(curve, vu0);
+      const float4 P3 = catmull_rom_basis_eval(curve, vu1);
+      const float4 dP3du = dscale * catmull_rom_basis_derivative(curve, vu1);
+
+      const float4 P1 = P0 + dP0du;
+      const float4 P2 = P3 - dP3du;
+
+      /* Calculate bounding cylinders. */
+      const float rr1 = sqr_point_to_line_distance(float4_to_float3(dP0du),
+                                                   float4_to_float3(P3 - P0));
+      const float rr2 = sqr_point_to_line_distance(float4_to_float3(dP3du),
+                                                   float4_to_float3(P3 - P0));
+      const float maxr12 = sqrtf(max(rr1, rr2));
+      const float one_plus_ulp = 1.0f + 2.0f * FLT_EPSILON;
+      const float one_minus_ulp = 1.0f - 2.0f * FLT_EPSILON;
+      float r_outer = max(max(P0.w, P1.w), max(P2.w, P3.w)) + maxr12;
+      float r_inner = min(min(P0.w, P1.w), min(P2.w, P3.w)) - maxr12;
+      r_outer = one_plus_ulp * r_outer;
+      r_inner = max(0.0f, one_minus_ulp * r_inner);
+      bool valid = true;
+
+      /* Intersect with outer cylinder. */
+      float2 tc_outer;
+      float u_outer0, u_outer1;
+      float3 Ng_outer0, Ng_outer1;
+      valid = cylinder_intersect(float4_to_float3(P0),
+                                 float4_to_float3(P3),
+                                 r_outer,
+                                 ray_dir,
+                                 &tc_outer,
+                                 &u_outer0,
+                                 &Ng_outer0,
+                                 &u_outer1,
+                                 &Ng_outer1);
+      if (!valid) {
+        continue;
+      }
+
+      /* Intersect with cap-planes. */
+      float2 tp = make_float2(-dt, isect->t - dt);
+      tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
+      const float2 h0 = half_plane_intersect(
+          float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
+      tp = make_float2(max(tp.x, h0.x), min(tp.y, h0.y));
+      const float2 h1 = half_plane_intersect(
+          float4_to_float3(P3), -float4_to_float3(dP3du), ray_dir);
+      tp = make_float2(max(tp.x, h1.x), min(tp.y, h1.y));
+      valid = tp.x <= tp.y;
+      if (!valid) {
+        continue;
+      }
+
+      /* Clamp and correct u parameter. */
+      u_outer0 = clamp(u_outer0, 0.0f, 1.0f);
+      u_outer1 = clamp(u_outer1, 0.0f, 1.0f);
+      u_outer0 = mix(u0, u1, (step + u_outer0) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1)));
+      u_outer1 = mix(u0, u1, (step + u_outer1) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1)));
+
+      /* Intersect with inner cylinder. */
+      float2 tc_inner;
+      float u_inner0, u_inner1;
+      float3 Ng_inner0, Ng_inner1;
+      const bool valid_inner = cylinder_intersect(float4_to_float3(P0),
+                                                  float4_to_float3(P3),
+                                                  r_inner,
+                                                  ray_dir,
+                                                  &tc_inner,
+                                                  &u_inner0,
+                                                  &Ng_inner0,
+                                                  &u_inner1,
+                                                  &Ng_inner1);
+
+      /* At the unstable area we subdivide deeper. */
+#  if 0
+      const bool unstable0 = (!valid_inner) |
+                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner0))) < 0.3f);
+      const bool unstable1 = (!valid_inner) |
+                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner1))) < 0.3f);
 #  else
-        w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#  endif
-        w = saturate(w);
-
-        /* compute u on the curve segment */
-        u = i_st * (1 - w) + i_en * w;
-        r_curr = r_st + (r_en - r_st) * u;
-        /* compare x-y distances */
-        float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u +
-                        curve_coef[0];
-
-        float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-        if (dot(tg, dp_st) < 0)
-          dp_st *= -1;
-        if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-          tree++;
-          level = tree & -tree;
-          continue;
-        }
-        float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-        if (dot(tg, dp_en) < 0)
-          dp_en *= -1;
-        if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-          tree++;
-          level = tree & -tree;
-          continue;
-        }
+      /* On the GPU it appears to be a little faster if always enabled. */
+      (void)valid_inner;
 
-        if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_curr * r_curr || p_curr.z <= epsilon ||
-            isect->t < p_curr.z) {
-          tree++;
-          level = tree & -tree;
-          continue;
-        }
+      const bool unstable0 = true;
+      const bool unstable1 = true;
+#  endif
 
-        t = p_curr.z;
+      /* Subtract the inner interval from the current hit interval. */
+      float2 tp0 = make_float2(tp.x, min(tp.y, tc_inner.x));
+      float2 tp1 = make_float2(max(tp.x, tc_inner.y), tp.y);
+      bool valid0 = valid && (tp0.x <= tp0.y);
+      bool valid1 = valid && (tp1.x <= tp1.y);
+      if (!(valid0 || valid1)) {
+        continue;
       }
-      else {
-        float l = len(p_en - p_st);
-        float invl = 1.0f / l;
-        float3 tg = (p_en - p_st) * invl;
-        gd = (r2 - r1) * invl;
-        float difz = -dot(p_st, tg);
-        float cyla = 1.0f - (tg.z * tg.z * (1 + gd * gd));
-        float invcyla = 1.0f / cyla;
-        float halfb = (-p_st.z - tg.z * (difz + gd * (difz * gd + r1)));
-        float tcentre = -halfb * invcyla;
-        float zcentre = difz + (tg.z * tcentre);
-        float3 tdif = -p_st;
-        tdif.z += tcentre;
-        float tdifz = dot(tdif, tg);
-        float tb = 2 * (tdif.z - tg.z * (tdifz + gd * (tdifz * gd + r1)));
-        float tc = dot(tdif, tdif) - tdifz * tdifz * (1 + gd * gd) - r1 * r1 - 2 * r1 * tdifz * gd;
-        float td = tb * tb - 4 * cyla * tc;
-        if (td < 0.0f) {
-          tree++;
-          level = tree & -tree;
-          continue;
-        }
 
-        float rootd = sqrtf(td);
-        float correction = (-tb - rootd) * 0.5f * invcyla;
-        t = tcentre + correction;
-
-        float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-        if (dot(tg, dp_st) < 0)
-          dp_st *= -1;
-        float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-        if (dot(tg, dp_en) < 0)
-          dp_en *= -1;
-
-        if (flags & CURVE_KN_BACKFACING &&
-            (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 ||
-             isect->t < t || t <= 0.0f)) {
-          correction = (-tb + rootd) * 0.5f * invcyla;
-          t = tcentre + correction;
+      /* Process one or two hits. */
+      bool recurse = false;
+      if (valid0) {
+        const int termDepth = unstable0 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
+                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
+        if (depth >= termDepth) {
+          found |= curve_intersect_iterative(
+              ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
         }
-
-        if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 ||
-            isect->t < t || t <= 0.0f) {
-          tree++;
-          level = tree & -tree;
-          continue;
+        else {
+          recurse = true;
         }
+      }
 
-        float w = (zcentre + (tg.z * correction)) * invl;
-        w = saturate(w);
-        /* compute u on the curve segment */
-        u = i_st * (1 - w) + i_en * w;
+      if (valid1 && (tp1.x + dt <= isect->t)) {
+        const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
+                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
+        if (depth >= termDepth) {
+          found |= curve_intersect_iterative(
+              ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+        }
+        else {
+          recurse = true;
+        }
       }
-      /* we found a new intersection */
 
-#  ifdef __VISIBILITY_FLAG__
-      /* visibility flag test. we do it here under the assumption
-       * that most triangles are culled by node flags */
-      if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#  endif
-      {
-        /* record intersection */
-        isect->t = t;
-        isect->u = u;
-        isect->v = gd;
-        isect->prim = curveAddr;
-        isect->object = object;
-        isect->type = type;
-        hit = true;
+      if (recurse) {
+        stack[depth].u0 = u0;
+        stack[depth].u1 = u1;
+        stack[depth].i = i + 1;
+        depth++;
+
+        u0 = vu0;
+        u1 = vu1;
+        i = -1;
       }
+    }
 
-      tree++;
-      level = tree & -tree;
+    if (depth > 0) {
+      depth--;
+      u0 = stack[depth].u0;
+      u1 = stack[depth].u1;
+      i = stack[depth].i;
     }
     else {
-      /* split the curve into two curves and process */
-      level = level >> 1;
+      break;
     }
   }
 
-  return hit;
+  return found;
+}
+
+/* Ribbons */
+
+ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, const float r)
+{
+  /* Cylinder culling: true if the 2D line through p1, p2 is within distance r of the origin. */
+  const float2 dp = p2 - p1;
+  const float num = dp.x * p1.y - dp.y * p1.x; /* 2D cross product of dp and p1. */
+  const float den2 = dot(p2 - p1, p2 - p1);
+  return num * num <= r * r * den2; /* (num / |dp|)^2 <= r^2, without dividing. */
+}
+
+/*! Intersects a ray with a quad with backface culling
+ *  enabled. The quad v0,v1,v2,v3 is split into two triangles
+ *  v0,v1,v3 and v2,v3,v1. The diagonal v1,v3 decides which of the two
+ *  triangles gets intersected. */
+ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
+                                             const float3 quad_v0,
+                                             const float3 quad_v1,
+                                             const float3 quad_v2,
+                                             const float3 quad_v3,
+                                             float *u_o,
+                                             float *v_o,
+                                             float *t_o)
+{
+  /* Ray is fixed at the origin looking down +Z; vertices are taken relative to that origin. */
+  const float3 O = make_float3(0.0f, 0.0f, 0.0f);
+  const float3 D = make_float3(0.0f, 0.0f, 1.0f);
+  const float3 va = quad_v0 - O;
+  const float3 vb = quad_v1 - O;
+  const float3 vc = quad_v2 - O;
+  const float3 vd = quad_v3 - O;
+
+  const float3 edb = vb - vd;
+  const float WW = dot(cross(vd, edb), D);
+  const float3 v0 = (WW <= 0.0f) ? va : vc;
+  const float3 v1 = (WW <= 0.0f) ? vb : vd;
+  const float3 v2 = (WW <= 0.0f) ? vd : vb;
+
+  /* Calculate edges. */
+  const float3 e0 = v2 - v0;
+  const float3 e1 = v0 - v1;
+
+  /* Perform edge tests. */
+  const float U = dot(cross(v0, e0), D);
+  const float V = dot(cross(v1, e1), D);
+  if (!(max(U, V) <= 0.0f)) {
+    return false;
+  }
+
+  /* Calculate geometry normal and denominator. */
+  const float3 Ng = cross(e1, e0);
+  const float den = dot(Ng, D);
+
+  /* Reject a zero denominator before it is used as a divisor. */
+  if (!(den != 0.0f)) {
+    return false;
+  }
+  const float rcpDen = 1.0f / den;
+
+  /* Perform depth test. */
+  const float t = rcpDen * dot(v0, Ng);
+  if (!(0.0f <= t && t <= ray_tfar)) {
+    return false;
+  }
+
+  /* Update hit information; u,v are mirrored when the second triangle was chosen. */
+  *t_o = t;
+  *u_o = U * rcpDen;
+  *v_o = V * rcpDen;
+  *u_o = (WW <= 0.0f) ? *u_o : 1.0f - *u_o;
+  *v_o = (WW <= 0.0f) ? *v_o : 1.0f - *v_o;
+  return true;
+}
+
+ccl_device_inline void ribbon_ray_space(const float3 ray_dir, float3 ray_space[3])
+{
+  ray_space[2] = ray_dir; /* Third axis is the ray direction itself. */
+  const float3 side_a = make_float3(0, ray_dir.z, -ray_dir.y); /* ray_dir x X axis. */
+  const float3 side_b = make_float3(-ray_dir.z, 0, ray_dir.x); /* ray_dir x Y axis. */
+  ray_space[0] = normalize(dot(side_a, side_a) > dot(side_b, side_b) ? side_a : side_b);
+  ray_space[1] = normalize(cross(ray_dir, ray_space[0]));
+}
+
+ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
+                                             const float3 ray_org,
+                                             const float4 P4)
+{
+  const float3 pt = float4_to_float3(P4) - ray_org; /* Point relative to the ray origin. */
+  return make_float4(dot(ray_space[0], pt), dot(ray_space[1], pt), dot(ray_space[2], pt), P4.w);
+}
+
+ccl_device_inline bool ribbon_intersect(const float3 ray_org,
+                                        const float3 ray_dir,
+                                        const float ray_tfar,
+                                        const int N,
+                                        float4 curve[4],
+                                        Intersection *isect)
+{
+  /* Transform control points into ray space, overwriting the caller's curve array. */
+  float3 ray_space[3];
+  ribbon_ray_space(ray_dir, ray_space);
+
+  curve[0] = ribbon_to_ray_space(ray_space, ray_org, curve[0]);
+  curve[1] = ribbon_to_ray_space(ray_space, ray_org, curve[1]);
+  curve[2] = ribbon_to_ray_space(ray_space, ray_org, curve[2]);
+  curve[3] = ribbon_to_ray_space(ray_space, ray_org, curve[3]);
+
+  const float4 mx = max(max(fabs(curve[0]), fabs(curve[1])), max(fabs(curve[2]), fabs(curve[3])));
+  const float eps = 4.0f * FLT_EPSILON * max(max(mx.x, mx.y), max(mx.z, mx.w));
+  const float step_size = 1.0f / (float)N;
+
+  /* Evaluate first point and radius scaled normal direction. */
+  float4 p0 = catmull_rom_basis_eval(curve, 0.0f);
+  float3 dp0dt = float4_to_float3(catmull_rom_basis_derivative(curve, 0.0f));
+  if (max3(fabs(dp0dt)) < eps) {
+    const float4 p1 = catmull_rom_basis_eval(curve, step_size);
+    dp0dt = float4_to_float3(p1 - p0);
+  }
+  float3 wn0 = normalize(make_float3(dp0dt.y, -dp0dt.x, 0.0f)) * p0.w;
+
+  /* Walk the Catmull-Rom segment in N steps, testing one quad per step. */
+  for (int i = 0; i < N; i++) {
+    const float u = i * step_size;
+    const float4 p1 = catmull_rom_basis_eval(curve, u + step_size);
+    const bool valid = cylinder_culling_test(
+        make_float2(p0.x, p0.y), make_float2(p1.x, p1.y), max(p0.w, p1.w));
+
+    /* Evaluate next point. Done even for culled steps, so the following step starts from it. */
+    float3 dp1dt = float4_to_float3(catmull_rom_basis_derivative(curve, u + step_size));
+    dp1dt = (max3(fabs(dp1dt)) < eps) ? float4_to_float3(p1 - p0) : dp1dt;
+    const float3 wn1 = normalize(make_float3(dp1dt.y, -dp1dt.x, 0.0f)) * p1.w;
+
+    if (valid) {
+      /* Construct quad coordinates. */
+      const float3 lp0 = float4_to_float3(p0) + wn0;
+      const float3 lp1 = float4_to_float3(p1) + wn1;
+      const float3 up0 = float4_to_float3(p0) - wn0;
+      const float3 up1 = float4_to_float3(p1) - wn1;
+
+      /* Intersect quad. */
+      float vu, vv, vt;
+      bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+
+      if (valid0) {
+        /* ignore self intersections */
+        const float avoidance_factor = 2.0f;
+        if (avoidance_factor != 0.0f) {
+          float r = mix(p0.w, p1.w, vu);
+          valid0 = vt > avoidance_factor * r;
+        }
+
+        if (valid0) {
+          vv = 2.0f * vv - 1.0f;
+
+          /* Record intersection. */
+          isect->t = vt;
+          isect->u = u + vu * step_size;
+          isect->v = vv;
+          return true;
+        }
+      }
+    }
+
+    /* Advance to the next step, also when the current one was culled. */
+    p0 = p1;
+    wn0 = wn1;
+  }
+  return false;
 }
 
 ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
                                             Intersection *isect,
-                                            float3 P,
-                                            float3 direction,
+                                            const float3 P,
+                                            const float3 dir,
                                             uint visibility,
                                             int object,
                                             int curveAddr,
                                             float time,
                                             int type)
 {
-  /* define few macros to minimize code duplication for SSE */
-#  ifndef __KERNEL_SSE2__
-#    define len3_squared(x) len_squared(x)
-#    define len3(x) len(x)
-#    define dot3(x, y) dot(x, y)
-#  endif
-
-  const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+  const bool is_motion = (type & PRIMITIVE_ALL_MOTION);
 
-#  ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
-  if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+#  ifndef __KERNEL_OPTIX__ /* See OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
+  if (is_motion && kernel_data.bvh.use_bvh_steps) {
     const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
     if (time < prim_time.x || time > prim_time.y) {
       return false;
@@ -517,210 +639,63 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
 #  endif
 
   int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-  /* curve Intersection check */
-  int flags = kernel_data.curve.curveflags;
-
   int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
   float4 v00 = kernel_tex_fetch(__curves, prim);
 
-  int cnum = __float_as_int(v00.x);
-  int k0 = cnum + segment;
+  int k0 = __float_as_int(v00.x) + segment;
   int k1 = k0 + 1;
 
-#  ifndef __KERNEL_SSE2__
-  float4 P_curve[2];
+  int ka = max(k0 - 1, __float_as_int(v00.x));
+  int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
 
-  if (is_curve_primitive) {
-    P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-    P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+  float4 curve[4];
+  if (!is_motion) {
+    curve[0] = kernel_tex_fetch(__curve_keys, ka);
+    curve[1] = kernel_tex_fetch(__curve_keys, k0);
+    curve[2] = kernel_tex_fetch(__curve_keys, k1);
+    curve[3] = kernel_tex_fetch(__curve_keys, kb);
   }
   else {
     int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-    motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
-  }
-
-  float r1 = P_curve[0].w;
-  float r2 = P_curve[1].w;
-  float3 p1 = float4_to_float3(P_curve[0]);
-  float3 p2 = float4_to_float3(P_curve[1]);
-
-  /* minimum width extension */
-  float3 dif = P - p1;
-  float3 dif_second = P - p2;
-
-  float3 p21_diff = p2 - p1;
-  float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-  float3 dir = direction;
-  float sphere_b_tmp = dot3(dir, sphere_dif1);
-  float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#  else
-  ssef P_curve[2];
-
-  if (is_curve_primitive) {
-    P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
-    P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+    motion_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, curve);
   }
-  else {
-    int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-    motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4 *)&P_curve);
-  }
-
-  ssef r12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-  const ssef vP = load4f(P);
-  const ssef dif = vP - P_curve[0];
-  const ssef dif_second = vP - P_curve[1];
-  float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
-  const ssef p21_diff = P_curve[1] - P_curve[0];
-  const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
-  const ssef dir = load4f(direction);
-  const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-  const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#  endif
-
-  float mr = max(r1, r2);
-  float l = len3(p21_diff);
-  float invl = 1.0f / l;
-  float sp_r = mr + 0.5f * l;
 
-  float sphere_b = dot3(dir, sphere_dif2);
-  float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-  if (sdisc < 0.0f)
-    return false;
-
-    /* obtain parameters and test midpoint distance for suitable modes */
-#  ifndef __KERNEL_SSE2__
-  float3 tg = p21_diff * invl;
-#  else
-  const ssef tg = p21_diff * invl;
-#  endif
-  float gd = (r2 - r1) * invl;
-
-  float dirz = dot3(dir, tg);
-  float difz = dot3(dif, tg);
-
-  float a = 1.0f - (dirz * dirz * (1 + gd * gd));
-
-  float halfb = dot3(dir, dif) - dirz * (difz + gd * (difz * gd + r1));
-
-  float tcentre = -halfb / a;
-  float zcentre = difz + (dirz * tcentre);
-
-  if ((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-    return false;
-  if ((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) &&
-      !(flags & CURVE_KN_INTERSECTCORRECTION))
+#  ifdef __VISIBILITY_FLAG__
+  if (!(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)) {
     return false;
-
-    /* test minimum separation */
-#  ifndef __KERNEL_SSE2__
-  float3 cprod = cross(tg, dir);
-  float cprod2sq = len3_squared(cross(tg, dif));
-#  else
-  const ssef cprod = cross(tg, dir);
-  float cprod2sq = len3_squared(cross_zxy(tg, dif));
+  }
 #  endif
-  float cprodsq = len3_squared(cprod);
-  float distscaled = dot3(cprod, dif);
-
-  if (cprodsq == 0)
-    distscaled = cprod2sq;
-  else
-    distscaled = (distscaled * distscaled) / cprodsq;
-
-  if (distscaled > mr * mr)
-    return false;
 
-    /* calculate true intersection */
-#  ifndef __KERNEL_SSE2__
-  float3 tdif = dif + tcentre * dir;
-#  else
-  const ssef tdif = madd(ssef(tcentre), dir, dif);
-#  endif
-  float tdifz = dot3(tdif, tg);
-  float tdifma = tdifz * gd + r1;
-  float tb = 2 * (dot3(dir, tdif) - dirz * (tdifz + gd * tdifma));
-  float tc = dot3(tdif, tdif) - tdifz * tdifz - tdifma * tdifma;
-  float td = tb * tb - 4 * a * tc;
+  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+    /* todo: adaptive number of subdivisions could help performance here. */
+    const int subdivisions = kernel_data.bvh.curve_subdivisions;
+    if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+      isect->prim = curveAddr;
+      isect->object = object;
+      isect->type = type;
+      return true;
+    }
 
-  if (td < 0.0f)
     return false;
-
-  float rootd = 0.0f;
-  float correction = 0.0f;
-  if (flags & CURVE_KN_ACCURATE) {
-    rootd = sqrtf(td);
-    correction = ((-tb - rootd) / (2 * a));
   }
-
-  float t = tcentre + correction;
-
-  if (t < isect->t) {
-
-    if (flags & CURVE_KN_INTERSECTCORRECTION) {
-      rootd = sqrtf(td);
-      correction = ((-tb - rootd) / (2 * a));
-      t = tcentre + correction;
-    }
-
-    float z = zcentre + (dirz * correction);
-    // bool backface = false;
-
-    if (flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-      // backface = true;
-      correction = ((-tb + rootd) / (2 * a));
-      t = tcentre + correction;
-      z = zcentre + (dirz * correction);
+  else {
+    if (curve_intersect_recursive(P, dir, curve, isect)) {
+      isect->prim = curveAddr;
+      isect->object = object;
+      isect->type = type;
+      return true;
     }
 
-    if (t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-      if (flags & CURVE_KN_ENCLOSEFILTER) {
-        float enc_ratio = 1.01f;
-        if ((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-          float a2 = 1.0f - (dirz * dirz * (1 + gd * gd * enc_ratio * enc_ratio));
-          float c2 = dot3(dif, dif) - difz * difz * (1 + gd * gd * enc_ratio * enc_ratio) -
-                     r1 * r1 * enc_ratio * enc_ratio - 2 * r1 * difz * gd * enc_ratio;
-          if (a2 * c2 < 0.0f)
-            return false;
-        }
-      }
-
-#  ifdef __VISIBILITY_FLAG__
-      /* visibility flag test. we do it here under the assumption
-       * that most triangles are culled by node flags */
-      if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#  endif
-      {
-        /* record intersection */
-        isect->t = t;
-        isect->u = z * invl;
-        isect->v = gd;
-        isect->prim = curveAddr;
-        isect->object = object;
-        isect->type = type;
-
-        return true;
-      }
-    }
+    return false;
   }
-
-  return false;
-
-#  ifndef __KERNEL_SSE2__
-#    undef len3_squared
-#    undef len3
-#    undef dot3
-#  endif
 }
 
-ccl_device_inline float3 curve_refine(KernelGlobals *kg,
-                                      ShaderData *sd,
-                                      const Intersection *isect,
-                                      const Ray *ray)
+ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+                                          ShaderData *sd,
+                                          const Intersection *isect,
+                                          const Ray *ray)
 {
-  int flag = kernel_data.curve.curveflags;
   float t = isect->t;
   float3 P = ray->P;
   float3 D = ray->D;
@@ -743,118 +718,60 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
   int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
   int k1 = k0 + 1;
 
-  float3 tg;
+  int ka = max(k0 - 1, __float_as_int(v00.x));
+  int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
 
-  if (flag & CURVE_KN_INTERPOLATE) {
-    int ka = max(k0 - 1, __float_as_int(v00.x));
-    int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+  float4 P_curve[4];
 
-    float4 P_curve[4];
+  if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
+    P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+    P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+    P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+    P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+  }
+  else {
+    motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+  }
 
-    if (sd->type & PRIMITIVE_CURVE) {
-      P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-      P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-      P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-      P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-    }
-    else {
-      motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
-    }
+  sd->u = isect->u;
+  sd->v = isect->v;
 
-    float3 p[4];
-    p[0] = float4_to_float3(P_curve[0]);
-    p[1] = float4_to_float3(P_curve[1]);
-    p[2] = float4_to_float3(P_curve[2]);
-    p[3] = float4_to_float3(P_curve[3]);
+  P = P + D * t;
 
-    P = P + D * t;
+  const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+  const float3 dPdu = float4_to_float3(dPdu4);
 
-#  ifdef __UV__
-    sd->u = isect->u;
-    sd->v = 0.0f;
-#  endif
+  if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+    /* Rounded smooth normals for ribbons, to approximate thick curve shape. */
+    const float3 tangent = normalize(dPdu);
+    const float3 bitangent = normalize(cross(tangent, -D));
+    const float sine = isect->v;
+    const float cosine = safe_sqrtf(1.0f - sine * sine);
 
-    tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+    sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
+    sd->Ng = -D;
 
-    if (kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-      sd->Ng = normalize(-(D - tg * (dot(tg, D))));
-    }
-    else {
-#  ifdef __EMBREE__
-      if (kernel_data.bvh.scene) {
-        sd->Ng = normalize(isect->Ng);
-      }
-      else
+#  if 0
+    /* This approximates the position and geometric normal of a thick curve too,
+     * but gives too many issues with wrong self intersections. */
+    const float dPdu_radius = dPdu4.w;
+    sd->Ng = sd->N;
+    P += sd->N * dPdu_radius;
 #  endif
-      {
-        /* direction from inside to surface of curve */
-        float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
-        sd->Ng = normalize(P - p_curr);
-
-        /* adjustment for changing radius */
-        float gd = isect->v;
-
-        if (gd != 0.0f) {
-          sd->Ng = sd->Ng - gd * tg;
-          sd->Ng = normalize(sd->Ng);
-        }
-      }
-    }
-
-    /* todo: sometimes the normal is still so that this is detected as
-     * backfacing even if cull backfaces is enabled */
-
-    sd->N = sd->Ng;
   }
   else {
-    float4 P_curve[2];
-
-    if (sd->type & PRIMITIVE_CURVE) {
-      P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-      P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
-    }
-    else {
-      motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
-    }
-
-    float l = 1.0f;
-    tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-
-    P = P + D * t;
-
-    float3 dif = P - float4_to_float3(P_curve[0]);
-
-#  ifdef __UV__
-    sd->u = dot(dif, tg) / l;
-    sd->v = 0.0f;
-#  endif
-
-    if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-      sd->Ng = -(D - tg * dot(tg, D));
-      sd->Ng = normalize(sd->Ng);
-    }
-    else {
-      float gd = isect->v;
-
-      /* direction from inside to surface of curve */
-      float denom = fmaxf(P_curve[0].w + sd->u * l * gd, 1e-8f);
-      sd->Ng = (dif - tg * sd->u * l) / denom;
-
-      /* adjustment for changing radius */
-      if (gd != 0.0f) {
-        sd->Ng = sd->Ng - gd * tg;
-      }
-
-      sd->Ng = normalize(sd->Ng);
-    }
-
+    /* Thick curves, compute normal using direction from inside the curve.
+     * This could be optimized by recording the normal in the intersection,
+     * however for Optix this would go beyond the size of the payload. */
+    const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+    sd->Ng = normalize(P - P_inside);
     sd->N = sd->Ng;
   }
 
 #  ifdef __DPDU__
   /* dPdu/dPdv */
-  sd->dPdu = tg;
-  sd->dPdv = cross(tg, sd->Ng);
+  sd->dPdu = dPdu;
+  sd->dPdv = cross(dPdu, sd->Ng);
 #  endif
 
   if (isect->object != OBJECT_NONE) {
@@ -867,7 +784,10 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
     P = transform_point(&tfm, P);
   }
 
-  return P;
+  sd->P = P;
+
+  float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+  sd->shader = __float_as_int(curvedata.z);
 }
 
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 0e2a00e9d2e..0f66f4af755 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -50,14 +50,14 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
   return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
 }
 
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
-                                                  int offset,
-                                                  int numkeys,
-                                                  int numsteps,
-                                                  int step,
-                                                  int k0,
-                                                  int k1,
-                                                  float4 keys[2])
+ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+                                                         int offset,
+                                                         int numkeys,
+                                                         int numsteps,
+                                                         int step,
+                                                         int k0,
+                                                         int k1,
+                                                         float4 keys[2])
 {
   if (step == numsteps) {
     /* center step: regular key location */
@@ -77,7 +77,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
 }
 
 /* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(
+ccl_device_inline void motion_curve_keys_linear(
     KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
 {
   /* get motion info */
@@ -97,24 +97,24 @@ ccl_device_inline void motion_curve_keys(
   /* fetch key coordinates */
   float4 next_keys[2];
 
-  motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, keys);
-  motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, next_keys);
+  motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step, k0, k1, keys);
+  motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step + 1, k0, k1, next_keys);
 
   /* interpolate between steps */
   keys[0] = (1.0f - t) * keys[0] + t * next_keys[0];
   keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
 }
 
-ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg,
-                                                           int offset,
-                                                           int numkeys,
-                                                           int numsteps,
-                                                           int step,
-                                                           int k0,
-                                                           int k1,
-                                                           int k2,
-                                                           int k3,
-                                                           float4 keys[4])
+ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+                                                  int offset,
+                                                  int numkeys,
+                                                  int numsteps,
+                                                  int step,
+                                                  int k0,
+                                                  int k1,
+                                                  int k2,
+                                                  int k3,
+                                                  float4 keys[4])
 {
   if (step == numsteps) {
     /* center step: regular key location */
@@ -138,15 +138,15 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg,
 }
 
 /* return 2 curve key locations */
-ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
-                                                  int object,
-                                                  int prim,
-                                                  float time,
-                                                  int k0,
-                                                  int k1,
-                                                  int k2,
-                                                  int k3,
-                                                  float4 keys[4])
+ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+                                         int object,
+                                         int prim,
+                                         float time,
+                                         int k0,
+                                         int k1,
+                                         int k2,
+                                         int k3,
+                                         float4 keys[4])
 {
   /* get motion info */
   int numsteps, numkeys;
@@ -165,9 +165,8 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
   /* fetch key coordinates */
   float4 next_keys[4];
 
-  motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
-  motion_cardinal_curve_keys_for_step(
-      kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
+  motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
+  motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
 
   /* interpolate between steps */
   keys[0] = (1.0f - t) * keys[0] + t * next_keys[0];
@@ -176,53 +175,6 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
   keys[3] = (1.0f - t) * keys[3] + t * next_keys[3];
 }
 
-#  if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-/* Similar to above, but returns keys as pair of two AVX registers with each
- * holding two float4.
- */
-ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg,
-                                                      int object,
-                                                      int prim,
-                                                      float time,
-                                                      int k0,
-                                                      int k1,
-                                                      int k2,
-                                                      int k3,
-                                                      avxf *out_keys_0_1,
-                                                      avxf *out_keys_2_3)
-{
-  /* Get motion info. */
-  int numsteps, numkeys;
-  object_motion_info(kg, object, &numsteps, NULL, &numkeys);
-
-  /* Figure out which steps we need to fetch and their interpolation factor. */
-  int maxstep = numsteps * 2;
-  int step = min((int)(time * maxstep), maxstep - 1);
-  float t = time * maxstep - step;
-
-  /* Find attribute. */
-  AttributeElement elem;
-  int offset = find_attribute_curve_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
-  kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-  /* Fetch key coordinates. */
-  float4 next_keys[4];
-  float4 keys[4];
-  motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
-  motion_cardinal_curve_keys_for_step(
-      kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
-
-  const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128);
-  const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128);
-  const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128);
-  const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128);
-
-  /* Interpolate between steps. */
-  *out_keys_0_1 = (1.0f - t) * keys_0_1 + t * next_keys_0_1;
-  *out_keys_2_3 = (1.0f - t) * keys_2_3 + t * next_keys_2_3;
-}
-#  endif
-
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 3aa68e1f84e..614e2e3b92b 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -411,25 +411,10 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 
 ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 {
-  /* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse
-   * direction */
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
-  const ssef oopes(8.271806E-25f, 8.271806E-25f, 8.271806E-25f, 0.0f);
-  const ssef mask = _mm_cmpgt_ps(fabs(dir), oopes);
-  const ssef signdir = signmsk(dir.m128) | oopes;
-#  ifndef __KERNEL_AVX__
-  ssef res = mask & ssef(dir);
-  res = _mm_or_ps(res, _mm_andnot_ps(mask, signdir));
-#  else
-  ssef res = _mm_blendv_ps(signdir, dir, mask);
-#  endif
-  return float3(res);
-#else  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
   const float ooeps = 8.271806E-25f;
   return make_float3((fabsf(dir.x) > ooeps) ? dir.x : copysignf(ooeps, dir.x),
                      (fabsf(dir.y) > ooeps) ? dir.y : copysignf(ooeps, dir.y),
                      (fabsf(dir.z) > ooeps) ? dir.z : copysignf(ooeps, dir.z));
-#endif /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
 }
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
@@ -457,38 +442,6 @@ ccl_device_inline float bvh_instance_push(
   return t;
 }
 
-#ifdef __QBVH__
-/* Same as above, but optimized for QBVH scene intersection,
- * which needs to modify two max distances.
- *
- * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
- * so we can avoid having this duplication.
- */
-ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
-                                          int object,
-                                          const Ray *ray,
-                                          float3 *P,
-                                          float3 *dir,
-                                          float3 *idir,
-                                          float *t,
-                                          float *t1)
-{
-  Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
-  *P = transform_point(&tfm, ray->P);
-
-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  if (*t != FLT_MAX)
-    *t *= len;
-
-  if (*t1 != -FLT_MAX)
-    *t1 *= len;
-}
-#endif
-
 /* Transorm ray to exit static object in BVH */
 
 ccl_device_inline float bvh_instance_pop(
@@ -551,39 +504,6 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
   return t;
 }
 
-#  ifdef __QBVH__
-/* Same as above, but optimized for QBVH scene intersection,
- * which needs to modify two max distances.
- *
- * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
- * so we can avoid having this duplication.
- */
-ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
-                                                 int object,
-                                                 const Ray *ray,
-                                                 float3 *P,
-                                                 float3 *dir,
-                                                 float3 *idir,
-                                                 float *t,
-                                                 float *t1,
-                                                 Transform *itfm)
-{
-  object_fetch_transform_motion_test(kg, object, ray->time, itfm);
-
-  *P = transform_point(itfm, ray->P);
-
-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  if (*t != FLT_MAX)
-    *t *= len;
-
-  if (*t1 != -FLT_MAX)
-    *t1 *= len;
-}
-#  endif
-
 /* Transorm ray to exit motion blurred object in BVH */
 
 ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 9a91da79f58..997abf438d0 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -174,6 +174,11 @@ ccl_device_inline float4 primitive_attribute_float4(KernelGlobals *kg,
     else
       return subd_triangle_attribute_float4(kg, sd, desc, dx, dy);
   }
+#ifdef __HAIR__
+  else if (sd->type & PRIMITIVE_ALL_CURVE) {
+    return curve_attribute_float4(kg, sd, desc, dx, dy);
+  }
+#endif
   else {
     if (dx)
       *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index a2731bf2bd0..0278f3ade8e 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -312,12 +312,21 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
                                             float4 *dx,
                                             float4 *dy)
 {
-  if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
-    int tri = desc.offset + sd->prim * 3;
-
-    float4 f0 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0));
-    float4 f1 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1));
-    float4 f2 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2));
+  if (desc.element == ATTR_ELEMENT_CORNER_BYTE || desc.element == ATTR_ELEMENT_VERTEX) {
+    float4 f0, f1, f2;
+
+    if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+      int tri = desc.offset + sd->prim * 3;
+      f0 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0));
+      f1 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1));
+      f2 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2));
+    }
+    else {
+      uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+    }
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 6604806f73b..b0cce274b94 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -71,433 +71,6 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
   return false;
 }
 
-#ifdef __KERNEL_AVX2__
-#  define cross256(A, B, C, D) _mm256_fmsub_ps(A, B, _mm256_mul_ps(C, D))
-ccl_device_inline int ray_triangle_intersect8(KernelGlobals *kg,
-                                              float3 ray_P,
-                                              float3 ray_dir,
-                                              Intersection **isect,
-                                              uint visibility,
-                                              int object,
-                                              __m256 *triA,
-                                              __m256 *triB,
-                                              __m256 *triC,
-                                              int prim_addr,
-                                              int prim_num,
-                                              uint *num_hits,
-                                              uint max_hits,
-                                              int *num_hits_in_instance,
-                                              float isect_t)
-{
-
-  const unsigned char prim_num_mask = (1 << prim_num) - 1;
-
-  const __m256i zero256 = _mm256_setzero_si256();
-
-  const __m256 Px256 = _mm256_set1_ps(ray_P.x);
-  const __m256 Py256 = _mm256_set1_ps(ray_P.y);
-  const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
-
-  const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
-  const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
-  const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
-
-  /* Calculate vertices relative to ray origin. */
-  __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
-  __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
-  __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
-
-  __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
-  __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
-  __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
-
-  __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
-  __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
-  __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
-
-  __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
-  __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
-  __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
-
-  __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
-  __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
-  __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
-
-  __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
-  __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
-  __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
-
-  /* Calculate triangle edges. */
-  __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
-  __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
-  __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
-
-  __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
-  __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
-  __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
-
-  __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
-  __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
-  __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
-
-  /* Perform edge tests. */
-  /* cross (AyBz - AzBy, AzBx -AxBz,  AxBy - AyBx) */
-  __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
-  __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
-  __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
-  /* vertical dot */
-  __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
-  U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256);
-  U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256);
-
-  __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
-  __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
-  __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
-  /* vertical dot */
-  __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
-  V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);
-  V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);
-
-  __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
-  __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
-  __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
-  /* vertical dot */
-  __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
-  W_256 = _mm256_fmadd_ps(W_y_256, diry256, W_256);
-  W_256 = _mm256_fmadd_ps(W_z_256, dirz256, W_256);
-
-  __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
-  __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
-  __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
-  __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
-
-  const __m256i one256 = _mm256_set1_epi32(1);
-  const __m256i two256 = _mm256_set1_epi32(2);
-
-  __m256i mask_minmaxUVW_256 = _mm256_or_si256(_mm256_cmpeq_epi32(one256, UVW_256_1),
-                                               _mm256_cmpeq_epi32(two256, UVW_256_1));
-
-  unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
-  if ((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) {  // all bits set
-    return false;
-  }
-
-  /* Calculate geometry normal and denominator. */
-  __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
-  __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
-  __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
-
-  Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
-  Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
-  Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
-
-  /* vertical dot */
-  __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
-  den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256, den_256);
-  den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256, den_256);
-
-  /* Perform depth test. */
-  __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
-  T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256, T_256);
-  T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256, T_256);
-
-  const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
-  __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
-
-  __m256 sign_T_256 = _mm256_castsi256_ps(
-      _mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
-
-  unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
-  if (((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
-    return false;
-  }
-
-  __m256 xor_signmask_256 = _mm256_castsi256_ps(
-      _mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
-
-  ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
-  ccl_align(32) unsigned int mask_minmaxUVW8[8];
-
-  if (visibility == PATH_RAY_SHADOW_OPAQUE) {
-    __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);
-    __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
-    __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
-    __m256 rayt_256 = _mm256_set1_ps((*isect)->t);
-    __m256i mask1 = _mm256_cmpgt_epi32(
-        _mm256_castps_si256(sign_T_256),
-        _mm256_castps_si256(_mm256_mul_ps(
-            _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)),
-            rayt_256)));
-    mask0 = _mm256_or_si256(mask1, mask0);
-    mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256);  //(~mask_minmaxUVW_pos) &(~mask)
-    mask_final_256 = _mm256_andnot_si256(
-        maskden256, mask_final_256);  //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
-    int mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
-    if ((mask_final & prim_num_mask) == 0) {
-      return false;
-    }
-    while (mask_final != 0) {
-      const int i = __bscf(mask_final);
-      if (i >= prim_num) {
-        return false;
-      }
-#  ifdef __VISIBILITY_FLAG__
-      if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
-        continue;
-      }
-#  endif
-      __m256 inv_den_256 = _mm256_rcp_ps(den_256);
-      U_256 = _mm256_mul_ps(U_256, inv_den_256);
-      V_256 = _mm256_mul_ps(V_256, inv_den_256);
-      T_256 = _mm256_mul_ps(T_256, inv_den_256);
-      _mm256_store_ps(U8, U_256);
-      _mm256_store_ps(V8, V_256);
-      _mm256_store_ps(T8, T_256);
-      (*isect)->u = U8[i];
-      (*isect)->v = V8[i];
-      (*isect)->t = T8[i];
-      (*isect)->prim = (prim_addr + i);
-      (*isect)->object = object;
-      (*isect)->type = PRIMITIVE_TRIANGLE;
-      return true;
-    }
-    return false;
-  }
-  else {
-    _mm256_store_ps(den8, den_256);
-    _mm256_store_ps(U8, U_256);
-    _mm256_store_ps(V8, V_256);
-    _mm256_store_ps(T8, T_256);
-
-    _mm256_store_ps(sign_T8, sign_T_256);
-    _mm256_store_ps(xor_signmask8, xor_signmask_256);
-    _mm256_store_si256((__m256i *)mask_minmaxUVW8, mask_minmaxUVW_256);
-
-    int ret = false;
-
-    if (visibility == PATH_RAY_SHADOW) {
-      for (int i = 0; i < prim_num; i++) {
-        if (mask_minmaxUVW8[i]) {
-          continue;
-        }
-#  ifdef __VISIBILITY_FLAG__
-        if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
-          continue;
-        }
-#  endif
-        if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) {
-          continue;
-        }
-        if (!den8[i]) {
-          continue;
-        }
-        const float inv_den = 1.0f / den8[i];
-        (*isect)->u = U8[i] * inv_den;
-        (*isect)->v = V8[i] * inv_den;
-        (*isect)->t = T8[i] * inv_den;
-        (*isect)->prim = (prim_addr + i);
-        (*isect)->object = object;
-        (*isect)->type = PRIMITIVE_TRIANGLE;
-        const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
-        int shader = 0;
-#  ifdef __HAIR__
-        if (kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
-#  endif
-        {
-          shader = kernel_tex_fetch(__tri_shader, prim);
-        }
-#  ifdef __HAIR__
-        else {
-          float4 str = kernel_tex_fetch(__curves, prim);
-          shader = __float_as_int(str.z);
-        }
-#  endif
-        const int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-        /* If no transparent shadows, all light is blocked. */
-        if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-          return 2;
-        }
-        /* If maximum number of hits reached, block all light. */
-        else if (num_hits == NULL || *num_hits == max_hits) {
-          return 2;
-        }
-        /* Move on to next entry in intersections array. */
-        ret = true;
-        (*isect)++;
-        (*num_hits)++;
-        (*num_hits_in_instance)++;
-        (*isect)->t = isect_t;
-      }
-    }
-    else {
-      for (int i = 0; i < prim_num; i++) {
-        if (mask_minmaxUVW8[i]) {
-          continue;
-        }
-#  ifdef __VISIBILITY_FLAG__
-        if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
-          continue;
-        }
-#  endif
-        if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) {
-          continue;
-        }
-        if (!den8[i]) {
-          continue;
-        }
-        const float inv_den = 1.0f / den8[i];
-        (*isect)->u = U8[i] * inv_den;
-        (*isect)->v = V8[i] * inv_den;
-        (*isect)->t = T8[i] * inv_den;
-        (*isect)->prim = (prim_addr + i);
-        (*isect)->object = object;
-        (*isect)->type = PRIMITIVE_TRIANGLE;
-        ret = true;
-      }
-    }
-    return ret;
-  }
-}
-
-ccl_device_inline int triangle_intersect8(KernelGlobals *kg,
-                                          Intersection **isect,
-                                          float3 P,
-                                          float3 dir,
-                                          uint visibility,
-                                          int object,
-                                          int prim_addr,
-                                          int prim_num,
-                                          uint *num_hits,
-                                          uint max_hits,
-                                          int *num_hits_in_instance,
-                                          float isect_t)
-{
-  __m128 tri_a[8], tri_b[8], tri_c[8];
-  __m256 tritmp[12], tri[12];
-  __m256 triA[3], triB[3], triC[3];
-
-  int i, r;
-
-  uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-  for (i = 0; i < prim_num; i++) {
-    tri_a[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
-    tri_b[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
-    tri_c[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
-  }
-  // create 9 or  12 placeholders
-  tri[0] = _mm256_castps128_ps256(tri_a[0]);  //_mm256_zextps128_ps256
-  tri[1] = _mm256_castps128_ps256(tri_b[0]);  //_mm256_zextps128_ps256
-  tri[2] = _mm256_castps128_ps256(tri_c[0]);  //_mm256_zextps128_ps256
-
-  tri[3] = _mm256_castps128_ps256(tri_a[1]);  //_mm256_zextps128_ps256
-  tri[4] = _mm256_castps128_ps256(tri_b[1]);  //_mm256_zextps128_ps256
-  tri[5] = _mm256_castps128_ps256(tri_c[1]);  //_mm256_zextps128_ps256
-
-  tri[6] = _mm256_castps128_ps256(tri_a[2]);  //_mm256_zextps128_ps256
-  tri[7] = _mm256_castps128_ps256(tri_b[2]);  //_mm256_zextps128_ps256
-  tri[8] = _mm256_castps128_ps256(tri_c[2]);  //_mm256_zextps128_ps256
-
-  if (prim_num > 3) {
-    tri[9] = _mm256_castps128_ps256(tri_a[3]);   //_mm256_zextps128_ps256
-    tri[10] = _mm256_castps128_ps256(tri_b[3]);  //_mm256_zextps128_ps256
-    tri[11] = _mm256_castps128_ps256(tri_c[3]);  //_mm256_zextps128_ps256
-  }
-
-  for (i = 4, r = 0; i < prim_num; i++, r += 3) {
-    tri[r] = _mm256_insertf128_ps(tri[r], tri_a[i], 1);
-    tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
-    tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
-  }
-
-  //------------------------------------------------
-  // 0!  Xa0 Ya0 Za0 1 Xa4 Ya4 Za4  1
-  // 1!  Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
-  // 2!  Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
-
-  // 3!  Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
-  // 4!  Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5  1
-  // 5!  Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
-
-  // 6!  Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
-  // 7!  Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6  1
-  // 8!  Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
-
-  // 9!  Xa3 Ya3 Za3 1 Xa7 Ya7 Za7  1
-  // 10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7  1
-  // 11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7  1
-
-  //"transpose"
-  tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]);  // 0!  Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
-  tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]);  // 1!  Za0 Za1 1   1   Za4 Za5  1   1
-
-  tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]);  // 2!  Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
-  tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]);  // 3!  Za2 Za3  1   1  Za6 Za7  1   1
-
-  tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]);  // 4!  Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
-  tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]);  // 5!  Zb0 Zb1  1  1   Zb4 Zb5  1   1
-
-  tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]);  // 6!  Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
-  tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]);  // 7!  Zb2 Zb3  1    1 Zb6 Zb7  1   1
-
-  tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]);  // 8!  Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
-  tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]);  // 9!  Zc0 Zc1  1   1  Zc4 Zc5  1   1
-
-  tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]);  // 10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
-  tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]);  // 11! Zc2 Zc3  1   1  Zc6 Zc7  1   1
-
-  /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
-  triA[0] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]),
-                         _mm256_castps_pd(tritmp[2])));  //  Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
-  triA[1] = _mm256_castpd_ps(
-      _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]),
-                         _mm256_castps_pd(tritmp[2])));  //  Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
-  triA[2] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]),
-                         _mm256_castps_pd(tritmp[3])));  //  Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
-
-  triB[0] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]),
-                         _mm256_castps_pd(tritmp[6])));  //  Xb0 Xb1  Xb2 Xb3 Xb4 Xb5 Xb5 Xb7
-  triB[1] = _mm256_castpd_ps(
-      _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]),
-                         _mm256_castps_pd(tritmp[6])));  //  Yb0 Yb1  Yb2 Yb3 Yb4 Yb5 Yb5 Yb7
-  triB[2] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]),
-                         _mm256_castps_pd(tritmp[7])));  //    Zb0 Zb1  Zb2 Zb3 Zb4 Zb5 Zb5 Zb7
-
-  triC[0] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]),
-                         _mm256_castps_pd(tritmp[10])));  // Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
-  triC[1] = _mm256_castpd_ps(
-      _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]),
-                         _mm256_castps_pd(tritmp[10])));  // Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
-  triC[2] = _mm256_castpd_ps(
-      _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]),
-                         _mm256_castps_pd(tritmp[11])));  // Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
-
-  /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
-
-  int result = ray_triangle_intersect8(kg,
-                                       P,
-                                       dir,
-                                       isect,
-                                       visibility,
-                                       object,
-                                       triA,
-                                       triB,
-                                       triC,
-                                       prim_addr,
-                                       prim_num,
-                                       num_hits,
-                                       max_hits,
-                                       num_hits_in_instance,
-                                       isect_t);
-  return result;
-}
-
-#endif /* __KERNEL_AVX2__ */
-
 /* Special ray intersection routines for subsurface scattering. In that case we
  * only want to intersect with primitives in the same object, and if case of
  * multiple hits we pick a single random primitive as the intersection point.
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 71b176a0a8f..4ac07d86dda 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -326,9 +326,7 @@ ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
   /* Background MIS weights. */
 #  ifdef __BACKGROUND_MIS__
   /* Check if background light exists or if we should skip pdf. */
-  int res_x = kernel_data.integrator.pdf_background_res_x;
-
-  if (!(state->flag & PATH_RAY_MIS_SKIP) && res_x) {
+  if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
     /* multiple importance sampling, get background light pdf for ray
      * direction, and compute weight with respect to BSDF pdf */
     float pdf = background_light_pdf(kg, ray->P, ray->D);
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 04472212d0c..138b90373a6 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "kernel_light_background.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Light Sample result */
@@ -33,500 +35,6 @@ typedef struct LightSample {
   LightType type; /* type of light */
 } LightSample;
 
-/* Area light sampling */
-
-/* Uses the following paper:
- *
- * Carlos Urena et al.
- * An Area-Preserving Parametrization for Spherical Rectangles.
- *
- * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
- *
- * Note: light_p is modified when sample_coord is true.
- */
-ccl_device_inline float rect_light_sample(float3 P,
-                                          float3 *light_p,
-                                          float3 axisu,
-                                          float3 axisv,
-                                          float randu,
-                                          float randv,
-                                          bool sample_coord)
-{
-  /* In our name system we're using P for the center,
-   * which is o in the paper.
-   */
-
-  float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
-  float axisu_len, axisv_len;
-  /* Compute local reference system R. */
-  float3 x = normalize_len(axisu, &axisu_len);
-  float3 y = normalize_len(axisv, &axisv_len);
-  float3 z = cross(x, y);
-  /* Compute rectangle coords in local reference system. */
-  float3 dir = corner - P;
-  float z0 = dot(dir, z);
-  /* Flip 'z' to make it point against Q. */
-  if (z0 > 0.0f) {
-    z *= -1.0f;
-    z0 *= -1.0f;
-  }
-  float x0 = dot(dir, x);
-  float y0 = dot(dir, y);
-  float x1 = x0 + axisu_len;
-  float y1 = y0 + axisv_len;
-  /* Compute internal angles (gamma_i). */
-  float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
-  float4 nz = make_float4(y0, x1, y1, x0) * diff;
-  nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
-  float g0 = safe_acosf(-nz.x * nz.y);
-  float g1 = safe_acosf(-nz.y * nz.z);
-  float g2 = safe_acosf(-nz.z * nz.w);
-  float g3 = safe_acosf(-nz.w * nz.x);
-  /* Compute predefined constants. */
-  float b0 = nz.x;
-  float b1 = nz.z;
-  float b0sq = b0 * b0;
-  float k = M_2PI_F - g2 - g3;
-  /* Compute solid angle from internal angles. */
-  float S = g0 + g1 - k;
-
-  if (sample_coord) {
-    /* Compute cu. */
-    float au = randu * S + k;
-    float fu = (cosf(au) * b0 - b1) / sinf(au);
-    float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
-    cu = clamp(cu, -1.0f, 1.0f);
-    /* Compute xu. */
-    float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
-    xu = clamp(xu, x0, x1);
-    /* Compute yv. */
-    float z0sq = z0 * z0;
-    float y0sq = y0 * y0;
-    float y1sq = y1 * y1;
-    float d = sqrtf(xu * xu + z0sq);
-    float h0 = y0 / sqrtf(d * d + y0sq);
-    float h1 = y1 / sqrtf(d * d + y1sq);
-    float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
-    float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
-
-    /* Transform (xu, yv, z0) to world coords. */
-    *light_p = P + xu * x + yv * y + z0 * z;
-  }
-
-  /* return pdf */
-  if (S != 0.0f)
-    return 1.0f / S;
-  else
-    return 0.0f;
-}
-
-ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
-{
-  to_unit_disk(&randu, &randv);
-  return ru * randu + rv * randv;
-}
-
-ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
-{
-  float3 ru, rv;
-
-  make_orthonormals(v, &ru, &rv);
-
-  return ellipse_sample(ru, rv, randu, randv);
-}
-
-ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
-{
-  return normalize(D + disk_light_sample(D, randu, randv) * radius);
-}
-
-ccl_device float3
-sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
-{
-  return disk_light_sample(normalize(P - center), randu, randv) * radius;
-}
-
-ccl_device float spot_light_attenuation(float3 dir,
-                                        float spot_angle,
-                                        float spot_smooth,
-                                        LightSample *ls)
-{
-  float3 I = ls->Ng;
-
-  float attenuation = dot(dir, I);
-
-  if (attenuation <= spot_angle) {
-    attenuation = 0.0f;
-  }
-  else {
-    float t = attenuation - spot_angle;
-
-    if (t < spot_smooth && spot_smooth != 0.0f)
-      attenuation *= smoothstepf(t / spot_smooth);
-  }
-
-  return attenuation;
-}
-
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
-{
-  float cos_pi = dot(Ng, I);
-
-  if (cos_pi <= 0.0f)
-    return 0.0f;
-
-  return t * t / cos_pi;
-}
-
-/* Background Light */
-
-#ifdef __BACKGROUND_MIS__
-
-ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
-{
-  /* for the following, the CDF values are actually a pair of floats, with the
-   * function value as X and the actual CDF as Y.  The last entry's function
-   * value is the CDF total. */
-  int res_x = kernel_data.integrator.pdf_background_res_x;
-  int res_y = kernel_data.integrator.pdf_background_res_y;
-  int cdf_width = res_x + 1;
-
-  /* this is basically std::lower_bound as used by pbrt */
-  int first = 0;
-  int count = res_y;
-
-  while (count > 0) {
-    int step = count >> 1;
-    int middle = first + step;
-
-    if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) {
-      first = middle + 1;
-      count -= step + 1;
-    }
-    else
-      count = step;
-  }
-
-  int index_v = max(0, first - 1);
-  kernel_assert(index_v >= 0 && index_v < res_y);
-
-  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
-  float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1);
-  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
-
-  /* importance-sampled V direction */
-  float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv);
-  float v = (index_v + dv) / res_y;
-
-  /* this is basically std::lower_bound as used by pbrt */
-  first = 0;
-  count = res_x;
-  while (count > 0) {
-    int step = count >> 1;
-    int middle = first + step;
-
-    if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y <
-        randu) {
-      first = middle + 1;
-      count -= step + 1;
-    }
-    else
-      count = step;
-  }
-
-  int index_u = max(0, first - 1);
-  kernel_assert(index_u >= 0 && index_u < res_x);
-
-  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                  index_v * cdf_width + index_u);
-  float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + index_u + 1);
-  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + res_x);
-
-  /* importance-sampled U direction */
-  float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu);
-  float u = (index_u + du) / res_x;
-
-  /* compute pdf */
-  float sin_theta = sinf(M_PI_F * v);
-  float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
-
-  if (sin_theta == 0.0f || denom == 0.0f)
-    *pdf = 0.0f;
-  else
-    *pdf = (cdf_u.x * cdf_v.x) / denom;
-
-  /* compute direction */
-  return equirectangular_to_direction(u, v);
-}
-
-/* TODO(sergey): Same as above, after the release we should consider using
- * 'noinline' for all devices.
- */
-ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
-{
-  float2 uv = direction_to_equirectangular(direction);
-  int res_x = kernel_data.integrator.pdf_background_res_x;
-  int res_y = kernel_data.integrator.pdf_background_res_y;
-  int cdf_width = res_x + 1;
-
-  float sin_theta = sinf(uv.y * M_PI_F);
-
-  if (sin_theta == 0.0f)
-    return 0.0f;
-
-  int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1);
-  int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1);
-
-  /* pdfs in V direction */
-  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + res_x);
-  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
-
-  float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
-
-  if (denom == 0.0f)
-    return 0.0f;
-
-  /* pdfs in U direction */
-  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                  index_v * cdf_width + index_u);
-  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
-
-  return (cdf_u.x * cdf_v.x) / denom;
-}
-
-ccl_device_inline bool background_portal_data_fetch_and_check_side(
-    KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
-{
-  int portal = kernel_data.integrator.portal_offset + index;
-  const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
-
-  *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-  *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
-  /* Check whether portal is on the right side. */
-  if (dot(*dir, P - *lightpos) > 1e-4f)
-    return true;
-
-  return false;
-}
-
-ccl_device_inline float background_portal_pdf(
-    KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
-{
-  float portal_pdf = 0.0f;
-
-  int num_possible = 0;
-  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
-    if (p == ignore_portal)
-      continue;
-
-    float3 lightpos, dir;
-    if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
-      continue;
-
-    /* There's a portal that could be sampled from this position. */
-    if (is_possible) {
-      *is_possible = true;
-    }
-    num_possible++;
-
-    int portal = kernel_data.integrator.portal_offset + p;
-    const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
-    float3 axisu = make_float3(
-        klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-    float3 axisv = make_float3(
-        klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-    bool is_round = (klight->area.invarea < 0.0f);
-
-    if (!ray_quad_intersect(P,
-                            direction,
-                            1e-4f,
-                            FLT_MAX,
-                            lightpos,
-                            axisu,
-                            axisv,
-                            dir,
-                            NULL,
-                            NULL,
-                            NULL,
-                            NULL,
-                            is_round))
-      continue;
-
-    if (is_round) {
-      float t;
-      float3 D = normalize_len(lightpos - P, &t);
-      portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
-    }
-    else {
-      portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
-    }
-  }
-
-  if (ignore_portal >= 0) {
-    /* We have skipped a portal that could be sampled as well. */
-    num_possible++;
-  }
-
-  return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
-}
-
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
-{
-  int num_possible_portals = 0;
-  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
-    float3 lightpos, dir;
-    if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
-      num_possible_portals++;
-  }
-  return num_possible_portals;
-}
-
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
-                                           float3 P,
-                                           float randu,
-                                           float randv,
-                                           int num_possible,
-                                           int *sampled_portal,
-                                           float *pdf)
-{
-  /* Pick a portal, then re-normalize randv. */
-  randv *= num_possible;
-  int portal = (int)randv;
-  randv -= portal;
-
-  /* TODO(sergey): Some smarter way of finding portal to sample
-   * is welcome.
-   */
-  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
-    /* Search for the sampled portal. */
-    float3 lightpos, dir;
-    if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
-      continue;
-
-    if (portal == 0) {
-      /* p is the portal to be sampled. */
-      int portal = kernel_data.integrator.portal_offset + p;
-      const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
-      float3 axisu = make_float3(
-          klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-      float3 axisv = make_float3(
-          klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-      bool is_round = (klight->area.invarea < 0.0f);
-
-      float3 D;
-      if (is_round) {
-        lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
-        float t;
-        D = normalize_len(lightpos - P, &t);
-        *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
-      }
-      else {
-        *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
-        D = normalize(lightpos - P);
-      }
-
-      *pdf /= num_possible;
-      *sampled_portal = p;
-      return D;
-    }
-
-    portal--;
-  }
-
-  return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
-{
-  /* Probability of sampling portals instead of the map. */
-  float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
-
-  /* Check if there are portals in the scene which we can sample. */
-  if (portal_sampling_pdf > 0.0f) {
-    int num_portals = background_num_possible_portals(kg, P);
-    if (num_portals > 0) {
-      if (portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) {
-        if (portal_sampling_pdf < 1.0f) {
-          randu /= portal_sampling_pdf;
-        }
-        int portal;
-        float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
-        if (num_portals > 1) {
-          /* Ignore the chosen portal, its pdf is already included. */
-          *pdf += background_portal_pdf(kg, P, D, portal, NULL);
-        }
-        /* We could also have sampled the map, so combine with MIS. */
-        if (portal_sampling_pdf < 1.0f) {
-          float cdf_pdf = background_map_pdf(kg, D);
-          *pdf = (portal_sampling_pdf * (*pdf) + (1.0f - portal_sampling_pdf) * cdf_pdf);
-        }
-        return D;
-      }
-      else {
-        /* Sample map, but with nonzero portal_sampling_pdf for MIS. */
-        randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
-      }
-    }
-    else {
-      /* We can't sample a portal.
-       * Check if we can sample the map instead.
-       */
-      if (portal_sampling_pdf == 1.0f) {
-        /* Use uniform as a fallback if we can't sample the map. */
-        *pdf = 1.0f / M_4PI_F;
-        return sample_uniform_sphere(randu, randv);
-      }
-      else {
-        portal_sampling_pdf = 0.0f;
-      }
-    }
-  }
-
-  float3 D = background_map_sample(kg, randu, randv, pdf);
-  /* Use MIS if portals could be sampled as well. */
-  if (portal_sampling_pdf > 0.0f) {
-    float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL);
-    *pdf = (portal_sampling_pdf * portal_pdf + (1.0f - portal_sampling_pdf) * (*pdf));
-  }
-  return D;
-}
-
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
-{
-  /* Probability of sampling portals instead of the map. */
-  float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
-
-  float portal_pdf = 0.0f, map_pdf = 0.0f;
-  if (portal_sampling_pdf > 0.0f) {
-    /* Evaluate PDF of sampling this direction by portal sampling. */
-    bool is_possible = false;
-    portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible) * portal_sampling_pdf;
-    if (!is_possible) {
-      /* Portal sampling is not possible here because all portals point to the wrong side.
-       * If map sampling is possible, it would be used instead,
-       * otherwise fallback sampling is used. */
-      if (portal_sampling_pdf == 1.0f) {
-        return kernel_data.integrator.pdf_lights / M_4PI_F;
-      }
-      else {
-        /* Force map sampling. */
-        portal_sampling_pdf = 0.0f;
-      }
-    }
-  }
-  if (portal_sampling_pdf < 1.0f) {
-    /* Evaluate PDF of sampling this direction by map sampling. */
-    map_pdf = background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf);
-  }
-  return (portal_pdf + map_pdf) * kernel_data.integrator.pdf_lights;
-}
-#endif
-
 /* Regular Light */
 
 ccl_device_inline bool lamp_light_sample(
@@ -594,7 +102,7 @@ ccl_device_inline bool lamp_light_sample(
         /* spot light attenuation */
         float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
         ls->eval_fac *= spot_light_attenuation(
-            dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls);
+            dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
         if (ls->eval_fac == 0.0f) {
           return false;
         }
@@ -732,7 +240,7 @@ ccl_device bool lamp_light_eval(
       /* spot light attenuation */
       float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
       ls->eval_fac *= spot_light_attenuation(
-          dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls);
+          dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
 
       if (ls->eval_fac == 0.0f)
         return false;
@@ -805,20 +313,18 @@ ccl_device_inline bool triangle_world_space_vertices(
     triangle_vertices(kg, prim, V);
   }
 
-#ifdef __INSTANCING__
   if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#  ifdef __OBJECT_MOTION__
+#ifdef __OBJECT_MOTION__
     float object_time = (time >= 0.0f) ? time : 0.5f;
     Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL);
-#  else
+#else
     Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#  endif
+#endif
     V[0] = transform_point(&tfm, V[0]);
     V[1] = transform_point(&tfm, V[1]);
     V[2] = transform_point(&tfm, V[2]);
     has_motion = true;
   }
-#endif
   return has_motion;
 }
 
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
new file mode 100644
index 00000000000..30e336f0f80
--- /dev/null
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -0,0 +1,448 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_light_common.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Background Light */
+
+#ifdef __BACKGROUND_MIS__
+
+ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+{
+  /* for the following, the CDF values are actually a pair of floats, with the
+   * function value as X and the actual CDF as Y.  The last entry's function
+   * value is the CDF total. */
+  int res_x = kernel_data.background.map_res_x;
+  int res_y = kernel_data.background.map_res_y;
+  int cdf_width = res_x + 1;
+
+  /* this is basically std::lower_bound as used by pbrt */
+  int first = 0;
+  int count = res_y;
+
+  while (count > 0) {
+    int step = count >> 1;
+    int middle = first + step;
+
+    if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) {
+      first = middle + 1;
+      count -= step + 1;
+    }
+    else
+      count = step;
+  }
+
+  int index_v = max(0, first - 1);
+  kernel_assert(index_v >= 0 && index_v < res_y);
+
+  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
+  float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1);
+  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+
+  /* importance-sampled V direction */
+  float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv);
+  float v = (index_v + dv) / res_y;
+
+  /* this is basically std::lower_bound as used by pbrt */
+  first = 0;
+  count = res_x;
+  while (count > 0) {
+    int step = count >> 1;
+    int middle = first + step;
+
+    if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y <
+        randu) {
+      first = middle + 1;
+      count -= step + 1;
+    }
+    else
+      count = step;
+  }
+
+  int index_u = max(0, first - 1);
+  kernel_assert(index_u >= 0 && index_u < res_x);
+
+  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
+                                  index_v * cdf_width + index_u);
+  float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf,
+                                       index_v * cdf_width + index_u + 1);
+  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
+                                       index_v * cdf_width + res_x);
+
+  /* importance-sampled U direction */
+  float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu);
+  float u = (index_u + du) / res_x;
+
+  /* compute pdf */
+  float sin_theta = sinf(M_PI_F * v);
+  float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
+
+  if (sin_theta == 0.0f || denom == 0.0f)
+    *pdf = 0.0f;
+  else
+    *pdf = (cdf_u.x * cdf_v.x) / denom;
+
+  /* compute direction */
+  return equirectangular_to_direction(u, v);
+}
+
+/* Evaluate the pdf that background_map_sample() assigns to a direction.
+ * TODO(sergey): After the release we should consider using 'noinline' for
+ * this function on all devices. */
+ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+{
+  float2 uv = direction_to_equirectangular(direction);
+  int res_x = kernel_data.background.map_res_x;
+  int res_y = kernel_data.background.map_res_y;
+  int cdf_width = res_x + 1;
+
+  float sin_theta = sinf(uv.y * M_PI_F);
+
+  if (sin_theta == 0.0f)
+    return 0.0f;
+
+  int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1);
+  int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1);
+
+  /* CDF totals: function value (x) of the last entry of the row and of the marginal. */
+  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
+                                       index_v * cdf_width + res_x);
+  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+
+  float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
+
+  if (denom == 0.0f)
+    return 0.0f;
+
+  /* Function values (x) of the conditional and marginal entries for this texel. */
+  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
+                                  index_v * cdf_width + index_u);
+  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
+
+  return (cdf_u.x * cdf_v.x) / denom;
+}
+
+ccl_device_inline bool background_portal_data_fetch_and_check_side(
+    KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+{
+  /* Portals are read from the lights array, starting at portal_offset. */
+  const ccl_global KernelLight *klight = &kernel_tex_fetch(
+      __lights, kernel_data.background.portal_offset + index);
+
+  /* Hand the portal position and direction back to the caller. */
+  *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+  *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
+
+  /* The portal is usable only when P lies on the side its direction points
+   * to; the small epsilon rejects points lying in the portal plane. */
+  return dot(*dir, P - *lightpos) > 1e-4f;
+}
+
+ccl_device_inline float background_portal_pdf(
+    KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+{
+  float portal_pdf = 0.0f;
+
+  int num_possible = 0;
+  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+    if (p == ignore_portal)
+      continue;
+
+    float3 lightpos, dir;
+    if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+      continue;
+
+    /* There's a portal that could be sampled from this position. */
+    if (is_possible) {
+      *is_possible = true;
+    }
+    num_possible++;
+
+    int portal = kernel_data.background.portal_offset + p;
+    const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+    float3 axisu = make_float3(
+        klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+    float3 axisv = make_float3(
+        klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+    bool is_round = (klight->area.invarea < 0.0f);
+
+    if (!ray_quad_intersect(P,
+                            direction,
+                            1e-4f,
+                            FLT_MAX,
+                            lightpos,
+                            axisu,
+                            axisv,
+                            dir,
+                            NULL,
+                            NULL,
+                            NULL,
+                            NULL,
+                            is_round))
+      continue;
+
+    if (is_round) {
+      float t;
+      float3 D = normalize_len(lightpos - P, &t);
+      portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+    }
+    else {
+      portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+    }
+  }
+
+  if (ignore_portal >= 0) {
+    /* We have skipped a portal that could be sampled as well. */
+    num_possible++;
+  }
+
+  return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
+}
+
+ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+{
+  /* Count the portals whose side check passes for P. */
+  int count = 0;
+  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+    float3 portal_pos, portal_dir; /* Fetched by the check, unused here. */
+    count += background_portal_data_fetch_and_check_side(kg, P, p, &portal_pos, &portal_dir);
+  }
+  return count;
+}
+
+ccl_device float3 background_portal_sample(KernelGlobals *kg,
+                                           float3 P,
+                                           float randu,
+                                           float randv,
+                                           int num_possible,
+                                           int *sampled_portal,
+                                           float *pdf)
+{
+  /* Sample a direction towards one of the num_possible portals usable from P,
+   * writing the chosen portal index and the pdf (divided by num_possible).
+   * Pick a portal, then re-normalize randv. */
+  randv *= num_possible;
+  int remaining = (int)randv;
+  randv -= remaining;
+
+  /* TODO(sergey): Some smarter way of finding portal to sample is welcome. */
+  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+    /* Search for the sampled portal, counting only the usable ones. */
+    float3 lightpos, dir;
+    if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+      continue;
+
+    if (remaining == 0) {
+      /* p is the portal to be sampled. */
+      int light_index = kernel_data.background.portal_offset + p;
+      const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, light_index);
+      float3 axisu = make_float3(
+          klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+      float3 axisv = make_float3(
+          klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+      /* A negative inverse area marks a round portal. */
+      bool is_round = (klight->area.invarea < 0.0f);
+      float3 D;
+      if (is_round) {
+        lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
+        float t;
+        D = normalize_len(lightpos - P, &t);
+        *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+      }
+      else {
+        *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
+        D = normalize(lightpos - P);
+      }
+      *pdf /= num_possible;
+      *sampled_portal = p;
+      return D;
+    }
+    remaining--;
+  }
+  /* Nothing was sampled: leave the outputs defined rather than uninitialized. */
+  *pdf = 0.0f;
+  *sampled_portal = -1;
+  return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+                                               float randu,
+                                               float randv,
+                                               float *pdf)
+{
+  /* background.sun packs the cone axis in xyz and the cone angle in w. */
+  const float4 sun = kernel_data.background.sun;
+  float3 direction;
+  sample_uniform_cone(float4_to_float3(sun), sun.w, randu, randv, &direction, pdf);
+  return direction;
+}
+
+ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+{
+  /* Pdf of D under the same cone that background_sun_sample() draws from. */
+  const float4 sun = kernel_data.background.sun;
+  return pdf_uniform_cone(float4_to_float3(sun), D, sun.w);
+}
+
+ccl_device_inline float3
+background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+{
+  float portal_method_pdf = kernel_data.background.portal_weight;
+  float sun_method_pdf = kernel_data.background.sun_weight;
+  float map_method_pdf = kernel_data.background.map_weight;
+
+  int num_portals = 0;
+  if (portal_method_pdf > 0.0f) {
+    /* Check if there are portals in the scene which we can sample. */
+    num_portals = background_num_possible_portals(kg, P);
+    if (num_portals == 0) {
+      portal_method_pdf = 0.0f;
+    }
+  }
+
+  float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf);
+  if (pdf_fac == 0.0f) {
+    /* Use uniform as a fallback if we can't use any strategy. */
+    *pdf = 1.0f / M_4PI_F;
+    return sample_uniform_sphere(randu, randv);
+  }
+
+  pdf_fac = 1.0f / pdf_fac;
+  portal_method_pdf *= pdf_fac;
+  sun_method_pdf *= pdf_fac;
+  map_method_pdf *= pdf_fac;
+
+  /* We have 100% in total and split it between the three categories.
+   * Therefore, we pick portals if randu is between 0 and portal_method_pdf,
+   * sun if randu is between portal_method_pdf and (portal_method_pdf + sun_method_pdf)
+   * and map if randu is between (portal_method_pdf + sun_method_pdf) and 1. */
+  float sun_method_cdf = portal_method_pdf + sun_method_pdf;
+
+  int method = 0;
+  float3 D;
+  if (randu < portal_method_pdf) {
+    method = 0;
+    /* Rescale randu. */
+    if (portal_method_pdf != 1.0f) {
+      randu /= portal_method_pdf;
+    }
+
+    /* Sample a portal. */
+    int portal;
+    D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
+    if (num_portals > 1) {
+      /* Ignore the chosen portal, its pdf is already included. */
+      *pdf += background_portal_pdf(kg, P, D, portal, NULL);
+    }
+
+    /* Skip MIS if this is the only method. */
+    if (portal_method_pdf == 1.0f) {
+      return D;
+    }
+    *pdf *= portal_method_pdf;
+  }
+  else if (randu < sun_method_cdf) {
+    method = 1;
+    /* Rescale randu. */
+    if (sun_method_pdf != 1.0f) {
+      randu = (randu - portal_method_pdf) / sun_method_pdf;
+    }
+
+    D = background_sun_sample(kg, randu, randv, pdf);
+
+    /* Skip MIS if this is the only method. */
+    if (sun_method_pdf == 1.0f) {
+      return D;
+    }
+    *pdf *= sun_method_pdf;
+  }
+  else {
+    method = 2;
+    /* Rescale randu. */
+    if (map_method_pdf != 1.0f) {
+      randu = (randu - sun_method_cdf) / map_method_pdf;
+    }
+
+    D = background_map_sample(kg, randu, randv, pdf);
+
+    /* Skip MIS if this is the only method. */
+    if (map_method_pdf == 1.0f) {
+      return D;
+    }
+    *pdf *= map_method_pdf;
+  }
+
+  /* MIS weighting. */
+  if (method != 0 && portal_method_pdf != 0.0f) {
+    *pdf += portal_method_pdf * background_portal_pdf(kg, P, D, -1, NULL);
+  }
+  if (method != 1 && sun_method_pdf != 0.0f) {
+    *pdf += sun_method_pdf * background_sun_pdf(kg, D);
+  }
+  if (method != 2 && map_method_pdf != 0.0f) {
+    *pdf += map_method_pdf * background_map_pdf(kg, D);
+  }
+  return D;
+}
+
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+{
+  float portal_method_pdf = kernel_data.background.portal_weight;
+  float sun_method_pdf = kernel_data.background.sun_weight;
+  float map_method_pdf = kernel_data.background.map_weight;
+
+  float portal_pdf = 0.0f;
+  /* Portals are a special case here since we need to compute their pdf in order
+   * to find out if we can sample them. */
+  if (portal_method_pdf > 0.0f) {
+    /* Evaluate PDF of sampling this direction by portal sampling. */
+    bool is_possible = false;
+    portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible);
+    if (!is_possible) {
+      /* Portal sampling is not possible here because all portals point to the wrong side.
+       * If other methods can be used instead, do so, otherwise uniform sampling is used as a
+       * fallback. */
+      portal_method_pdf = 0.0f;
+    }
+  }
+
+  float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf);
+  if (pdf_fac == 0.0f) {
+    /* Use uniform as a fallback if we can't use any strategy. */
+    return kernel_data.integrator.pdf_lights / M_4PI_F;
+  }
+
+  pdf_fac = 1.0f / pdf_fac;
+  portal_method_pdf *= pdf_fac;
+  sun_method_pdf *= pdf_fac;
+  map_method_pdf *= pdf_fac;
+
+  float pdf = portal_pdf * portal_method_pdf;
+  if (sun_method_pdf != 0.0f) {
+    pdf += background_sun_pdf(kg, direction) * sun_method_pdf;
+  }
+  if (map_method_pdf != 0.0f) {
+    pdf += background_map_pdf(kg, direction) * map_method_pdf;
+  }
+
+  return pdf * kernel_data.integrator.pdf_lights;
+}
+
+#endif
+
+CCL_NAMESPACE_END
+\ No newline at end of file
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
new file mode 100644
index 00000000000..39503a4b479
--- /dev/null
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Area light sampling */
+
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ *
+ * Note: light_p is modified when sample_coord is true.
+ */
+ccl_device_inline float rect_light_sample(float3 P,
+                                          float3 *light_p,
+                                          float3 axisu,
+                                          float3 axisv,
+                                          float randu,
+                                          float randv,
+                                          bool sample_coord)
+{
+  /* In our name system we're using P for the center,
+   * which is o in the paper.
+   */
+
+  float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
+  float axisu_len, axisv_len;
+  /* Compute local reference system R. */
+  float3 x = normalize_len(axisu, &axisu_len);
+  float3 y = normalize_len(axisv, &axisv_len);
+  float3 z = cross(x, y);
+  /* Compute rectangle coords in local reference system. */
+  float3 dir = corner - P;
+  float z0 = dot(dir, z);
+  /* Flip 'z' to make it point against Q. */
+  if (z0 > 0.0f) {
+    z *= -1.0f;
+    z0 *= -1.0f;
+  }
+  float x0 = dot(dir, x);
+  float y0 = dot(dir, y);
+  float x1 = x0 + axisu_len;
+  float y1 = y0 + axisv_len;
+  /* Compute internal angles (gamma_i). */
+  float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
+  float4 nz = make_float4(y0, x1, y1, x0) * diff;
+  nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
+  float g0 = safe_acosf(-nz.x * nz.y);
+  float g1 = safe_acosf(-nz.y * nz.z);
+  float g2 = safe_acosf(-nz.z * nz.w);
+  float g3 = safe_acosf(-nz.w * nz.x);
+  /* Compute predefined constants. */
+  float b0 = nz.x;
+  float b1 = nz.z;
+  float b0sq = b0 * b0;
+  float k = M_2PI_F - g2 - g3;
+  /* Compute solid angle from internal angles. */
+  float S = g0 + g1 - k;
+
+  if (sample_coord) {
+    /* Compute cu. */
+    float au = randu * S + k;
+    float fu = (cosf(au) * b0 - b1) / sinf(au);
+    float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+    cu = clamp(cu, -1.0f, 1.0f);
+    /* Compute xu. */
+    float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
+    xu = clamp(xu, x0, x1);
+    /* Compute yv. */
+    float z0sq = z0 * z0;
+    float y0sq = y0 * y0;
+    float y1sq = y1 * y1;
+    float d = sqrtf(xu * xu + z0sq);
+    float h0 = y0 / sqrtf(d * d + y0sq);
+    float h1 = y1 / sqrtf(d * d + y1sq);
+    float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+    float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+    /* Transform (xu, yv, z0) to world coords. */
+    *light_p = P + xu * x + yv * y + z0 * z;
+  }
+
+  /* return pdf */
+  if (S != 0.0f)
+    return 1.0f / S;
+  else
+    return 0.0f;
+}
+
+ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
+{
+  to_unit_disk(&randu, &randv);
+  return ru * randu + rv * randv;
+}
+
+ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
+{
+  float3 ru, rv;
+
+  make_orthonormals(v, &ru, &rv);
+
+  return ellipse_sample(ru, rv, randu, randv);
+}
+
+ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
+{
+  return normalize(D + disk_light_sample(D, randu, randv) * radius);
+}
+
+ccl_device float3
+sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
+{
+  return disk_light_sample(normalize(P - center), randu, randv) * radius;
+}
+
+ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, float3 N)
+{
+  /* spot_angle is compared with a dot product, so presumably it stores a cosine. */
+  const float cos_dir = dot(dir, N);
+
+  /* At or below the cutoff there is no light at all. */
+  if (cos_dir <= spot_angle) {
+    return 0.0f;
+  }
+
+  /* Within spot_smooth of the cutoff the result is scaled by smoothstepf(). */
+  const float edge_distance = cos_dir - spot_angle;
+  const bool blend = (edge_distance < spot_smooth) && (spot_smooth != 0.0f);
+
+  return blend ? cos_dir * smoothstepf(edge_distance / spot_smooth) : cos_dir;
+}
+
+ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+{
+  /* Returns t * t divided by dot(Ng, I). */
+  const float cos_pi = dot(Ng, I);
+
+  /* Zero when the dot product is not positive, which also avoids dividing
+   * by zero. */
+  return (cos_pi <= 0.0f) ? 0.0f : t * t / cos_pi;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index 5c776e06547..0edcc1a5a14 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -98,6 +98,16 @@ ccl_device_inline void sample_uniform_cone(
   *pdf = M_1_2PI_F / (1.0f - zMin);
 }
 
+ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle)
+{
+  /* Density of sample_uniform_cone(): constant for directions inside the
+   * cone of the given angle around N, zero for all others. */
+  const float cos_angle = cosf(angle);
+  const bool inside_cone = dot(N, D) > cos_angle;
+
+  return inside_cone ? M_1_2PI_F / (1.0f - cos_angle) : 0.0f;
+}
+
 /* sample uniform point on the surface of a sphere */
 ccl_device float3 sample_uniform_sphere(float u1, float u2)
 {
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 9700aaba80f..3d9f787f267 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -63,10 +63,8 @@ ccl_device_noinline
 {
   PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
 
-#ifdef __INSTANCING__
   sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
                                                 isect->object;
-#endif
   sd->lamp = LAMP_NONE;
 
   sd->type = isect->type;
@@ -82,18 +80,13 @@ ccl_device_noinline
   sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
   sd->ray_length = isect->t;
 
-#ifdef __UV__
   sd->u = isect->u;
   sd->v = isect->v;
-#endif
 
 #ifdef __HAIR__
   if (sd->type & PRIMITIVE_ALL_CURVE) {
     /* curve */
-    float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-
-    sd->shader = __float_as_int(curvedata.z);
-    sd->P = curve_refine(kg, sd, isect, ray);
+    curve_shader_setup(kg, sd, isect, ray);
   }
   else
 #endif
@@ -125,17 +118,15 @@ ccl_device_noinline
 
   sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
 
-#ifdef __INSTANCING__
   if (isect->object != OBJECT_NONE) {
     /* instance transform */
     object_normal_transform_auto(kg, sd, &sd->N);
     object_normal_transform_auto(kg, sd, &sd->Ng);
-#  ifdef __DPDU__
+#ifdef __DPDU__
     object_dir_transform_auto(kg, sd, &sd->dPdu);
     object_dir_transform_auto(kg, sd, &sd->dPdv);
-#  endif
-  }
 #endif
+  }
 
   /* backfacing test */
   bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
@@ -185,10 +176,8 @@ ccl_device_inline
   sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
   sd->type = isect->type;
 
-#  ifdef __UV__
   sd->u = isect->u;
   sd->v = isect->v;
-#  endif
 
   /* fetch triangle data */
   if (sd->type == PRIMITIVE_TRIANGLE) {
@@ -215,17 +204,15 @@ ccl_device_inline
 
   sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
 
-#  ifdef __INSTANCING__
   if (isect->object != OBJECT_NONE) {
     /* instance transform */
     object_normal_transform_auto(kg, sd, &sd->N);
     object_normal_transform_auto(kg, sd, &sd->Ng);
-#    ifdef __DPDU__
+#  ifdef __DPDU__
     object_dir_transform_auto(kg, sd, &sd->dPdu);
     object_dir_transform_auto(kg, sd, &sd->dPdv);
-#    endif
-  }
 #  endif
+  }
 
   /* backfacing test */
   if (backfacing) {
@@ -284,17 +271,13 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
   else
     sd->type = PRIMITIVE_NONE;
 
-    /* primitive */
-#ifdef __INSTANCING__
+  /* primitive */
   sd->object = object;
-#endif
   sd->lamp = LAMP_NONE;
   /* currently no access to bvh prim index for strand sd->prim*/
   sd->prim = prim;
-#ifdef __UV__
   sd->u = u;
   sd->v = v;
-#endif
   sd->time = time;
   sd->ray_length = t;
 
@@ -330,23 +313,19 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
     if (sd->shader & SHADER_SMOOTH_NORMAL) {
       sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
-#ifdef __INSTANCING__
       if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
         object_normal_transform_auto(kg, sd, &sd->N);
       }
-#endif
     }
 
     /* dPdu/dPdv */
 #ifdef __DPDU__
     triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
-#  ifdef __INSTANCING__
     if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       object_dir_transform_auto(kg, sd, &sd->dPdu);
       object_dir_transform_auto(kg, sd, &sd->dPdv);
     }
-#  endif
 #endif
   }
   else {
@@ -432,15 +411,11 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
   sd->time = ray->time;
   sd->ray_length = 0.0f;
 
-#ifdef __INSTANCING__
   sd->object = OBJECT_NONE;
-#endif
   sd->lamp = LAMP_NONE;
   sd->prim = PRIM_NONE;
-#ifdef __UV__
   sd->u = 0.0f;
   sd->v = 0.0f;
-#endif
 
 #ifdef __DPDU__
   /* dPdu/dPdv */
@@ -481,17 +456,13 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
   sd->time = ray->time;
   sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
 
-#  ifdef __INSTANCING__
   sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
-#  endif
   sd->lamp = LAMP_NONE;
   sd->prim = PRIM_NONE;
   sd->type = PRIMITIVE_NONE;
 
-#  ifdef __UV__
   sd->u = 0.0f;
   sd->v = 0.0f;
-#  endif
 
 #  ifdef __DPDU__
   /* dPdu/dPdv */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 0a0cf1bd6c0..fc9cc73a704 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -84,9 +84,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Kernel features */
 #define __SOBOL__
-#define __INSTANCING__
 #define __DPDU__
-#define __UV__
 #define __BACKGROUND__
 #define __CAUSTICS_TRICKS__
 #define __VISIBILITY_FLAG__
@@ -125,9 +123,6 @@ CCL_NAMESPACE_BEGIN
 
 /* Device specific features */
 #ifdef __KERNEL_CPU__
-#  ifdef __KERNEL_SSE2__
-#    define __QBVH__
-#  endif
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
@@ -696,27 +691,38 @@ typedef enum PrimitiveType {
   PRIMITIVE_NONE = 0,
   PRIMITIVE_TRIANGLE = (1 << 0),
   PRIMITIVE_MOTION_TRIANGLE = (1 << 1),
-  PRIMITIVE_CURVE = (1 << 2),
-  PRIMITIVE_MOTION_CURVE = (1 << 3),
+  PRIMITIVE_CURVE_THICK = (1 << 2),
+  PRIMITIVE_MOTION_CURVE_THICK = (1 << 3),
+  PRIMITIVE_CURVE_RIBBON = (1 << 4),
+  PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
   /* Lamp primitive is not included below on purpose,
    * since it is no real traceable primitive.
    */
-  PRIMITIVE_LAMP = (1 << 4),
+  PRIMITIVE_LAMP = (1 << 6),
 
   PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
-  PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE | PRIMITIVE_MOTION_CURVE),
-  PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE),
+  PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE_THICK | PRIMITIVE_MOTION_CURVE_THICK |
+                         PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON),
+  PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
+                          PRIMITIVE_MOTION_CURVE_RIBBON),
   PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE),
 
   /* Total number of different traceable primitives.
    * NOTE: This is an actual value, not a bitflag.
    */
-  PRIMITIVE_NUM_TOTAL = 4,
+  PRIMITIVE_NUM_TOTAL = 6,
 } PrimitiveType;
 
 #define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
 #define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
 
+typedef enum CurveShapeType {
+  CURVE_RIBBON = 0,
+  CURVE_THICK = 1,
+
+  CURVE_NUM_SHAPE_TYPES,
+} CurveShapeType;
+
 /* Attributes */
 
 typedef enum AttributePrimitive {
@@ -1291,6 +1297,24 @@ typedef struct KernelBackground {
   float ao_factor;
   float ao_distance;
   float ao_bounces_factor;
+
+  /* portal sampling */
+  float portal_weight;
+  int num_portals;
+  int portal_offset;
+
+  /* sun sampling */
+  float sun_weight;
+  /* xyz store direction, w the angle. float4 instead of float3 is used
+   * to ensure consistent padding/alignment across devices. */
+  float4 sun;
+
+  /* map sampling */
+  float map_weight;
+  int map_res_x;
+  int map_res_y;
+
+  int use_mis;
 } KernelBackground;
 static_assert_align(KernelBackground, 16);
 
@@ -1302,15 +1326,8 @@ typedef struct KernelIntegrator {
   int num_all_lights;
   float pdf_triangles;
   float pdf_lights;
-  int pdf_background_res_x;
-  int pdf_background_res_y;
   float light_inv_rr_threshold;
 
-  /* light portals */
-  float portal_pdf;
-  int num_portals;
-  int portal_offset;
-
   /* bounces */
   int min_bounce;
   int max_bounce;
@@ -1372,7 +1389,7 @@ typedef struct KernelIntegrator {
 
   int max_closures;
 
-  int pad1;
+  int pad1, pad2;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1380,13 +1397,11 @@ typedef enum KernelBVHLayout {
   BVH_LAYOUT_NONE = 0,
 
   BVH_LAYOUT_BVH2 = (1 << 0),
-  BVH_LAYOUT_BVH4 = (1 << 1),
-  BVH_LAYOUT_BVH8 = (1 << 2),
+  BVH_LAYOUT_EMBREE = (1 << 1),
+  BVH_LAYOUT_OPTIX = (1 << 2),
 
-  BVH_LAYOUT_EMBREE = (1 << 3),
-  BVH_LAYOUT_OPTIX = (1 << 4),
-
-  BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
+  /* Default BVH layout to use for CPU. */
+  BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE,
   BVH_LAYOUT_ALL = (unsigned int)(~0u),
 } KernelBVHLayout;
 
@@ -1395,9 +1410,9 @@ typedef struct KernelBVH {
   int root;
   int have_motion;
   int have_curves;
-  int have_instancing;
   int bvh_layout;
   int use_bvh_steps;
+  int curve_subdivisions;
 
   /* Custom BVH */
 #ifdef __KERNEL_OPTIX__
@@ -1415,25 +1430,6 @@ typedef struct KernelBVH {
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
-typedef enum CurveFlag {
-  /* runtime flags */
-  CURVE_KN_BACKFACING = 1,           /* backside of cylinder? */
-  CURVE_KN_ENCLOSEFILTER = 2,        /* don't consider strands surrounding start point? */
-  CURVE_KN_INTERPOLATE = 4,          /* render as a curve? */
-  CURVE_KN_ACCURATE = 8,             /* use accurate intersections test? */
-  CURVE_KN_INTERSECTCORRECTION = 16, /* correct for width after determing closest midpoint? */
-  CURVE_KN_TRUETANGENTGNORMAL = 32,  /* use tangent normal for geometry? */
-  CURVE_KN_RIBBONS = 64,             /* use flat curve ribbons */
-} CurveFlag;
-
-typedef struct KernelCurves {
-  int curveflags;
-  int subdivisions;
-
-  int pad1, pad2;
-} KernelCurves;
-static_assert_align(KernelCurves, 16);
-
 typedef struct KernelTables {
   int beckmann_offset;
   int pad1, pad2, pad3;
@@ -1454,7 +1450,6 @@ typedef struct KernelData {
   KernelBackground background;
   KernelIntegrator integrator;
   KernelBVH bvh;
-  KernelCurves curve;
   KernelTables tables;
   KernelBake bake;
 } KernelData;
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 8829a14ead5..8040bfb7b33 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -64,12 +64,14 @@ CCL_NAMESPACE_BEGIN
 
 /* Memory Copy */
 
-void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
 {
-  if (strcmp(name, "__data") == 0)
-    memcpy(&kg->__data, host, size);
-  else
+  if (strcmp(name, "__data") == 0) {
+    kg->__data = *(KernelData *)host;
+  }
+  else {
     assert(0);
+  }
 }
 
 void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
index e03504316ad..c730d952ed4 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
@@ -256,11 +256,9 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
 }
 
 #ifdef __HAIR__
-extern "C" __global__ void __intersection__curve()
+ccl_device_inline void optix_intersection_curve(const uint prim, const uint type)
 {
-  const uint prim = optixGetPrimitiveIndex();
   const uint object = get_object_id<true>();
-  const uint type = kernel_tex_fetch(__prim_type, prim);
   const uint visibility = optixGetPayload_4();
 
   float3 P = optixGetObjectRayOrigin();
@@ -282,14 +280,30 @@ extern "C" __global__ void __intersection__curve()
   if (isect.t != FLT_MAX)
     isect.t *= len;
 
-  if (!(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) ?
-          curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type) :
-          cardinal_curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+  if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
     optixReportIntersection(isect.t / len,
                             type & PRIMITIVE_ALL,
                             __float_as_int(isect.u),   // Attribute_0
                             __float_as_int(isect.v));  // Attribute_1
   }
+
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+  const uint prim = optixGetPrimitiveIndex();
+  const uint type = kernel_tex_fetch(__prim_type, prim);
+
+  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+    optix_intersection_curve(prim, type);
+  }
+}
+
+extern "C" __global__ void __intersection__curve_all()
+{
+  const uint prim = optixGetPrimitiveIndex();
+  const uint type = kernel_tex_fetch(__prim_type, prim);
+  optix_intersection_curve(prim, type);
 }
 #endif
 
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index fc0c845fd4f..d7ab778181e 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -36,6 +36,15 @@ set(LIB
 # OSL and LLVM are built without RTTI
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
 
+if(APPLE)
+  # Disable allocation warning on macOS prior to 10.14: the OSLRenderServices
+  # contains a member which is 64 bytes aligned (cache inside of OIIO's
+  # unordered_map_concurrent). This is not something the SDK supports, but
+  # since we take care of allocations ourselves it is OK to ignore the
+  # diagnostic message.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-allocation")
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 872a55143cc..7ee467a46dd 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -362,6 +362,9 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
                    id++,
                    closure_bsdf_transparent_params(),
                    closure_bsdf_transparent_prepare);
+
+  register_closure(
+      ss, "microfacet", id++, closure_bsdf_microfacet_params(), closure_bsdf_microfacet_prepare);
   register_closure(ss,
                    "microfacet_ggx",
                    id++,
@@ -508,6 +511,82 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
   return false;
 }
 
+/* Standard Microfacet Closure */
+
+class MicrofacetClosure : public CBSDFClosure {
+ public:
+  MicrofacetBsdf params;
+  ustring distribution;
+  int refract;
+
+  void setup(ShaderData *sd, int path_flag, float3 weight)
+  {
+    static ustring u_ggx("ggx");
+    static ustring u_default("default");
+
+    const int label = (refract) ? LABEL_TRANSMIT : LABEL_REFLECT;
+    if (skip(sd, path_flag, LABEL_GLOSSY | label)) {
+      return;
+    }
+
+    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
+        sd, sizeof(MicrofacetBsdf), weight, &params);
+
+    if (!bsdf) {
+      return;
+    }
+
+    /* GGX, also selected by the "default" distribution name. */
+    if (distribution == u_ggx || distribution == u_default) {
+      if (!refract) {
+        if (params.alpha_x == params.alpha_y) {
+          /* Isotropic: alpha_x equals alpha_y. */
+          sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+        }
+        else {
+          /* Anisotropic */
+          sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
+        }
+      }
+      else {
+        sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+      }
+    }
+    /* Beckmann, used for every other distribution name. */
+    else {
+      if (!refract) {
+        if (params.alpha_x == params.alpha_y) {
+          /* Isotropic: alpha_x equals alpha_y. */
+          sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
+        }
+        else {
+          /* Anisotropic */
+          sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
+        }
+      }
+      else {
+        sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+      }
+    }
+  }
+};
+
+ClosureParam *closure_bsdf_microfacet_params()
+{
+  static ClosureParam params[] = {CLOSURE_STRING_PARAM(MicrofacetClosure, distribution),
+                                  CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.N),
+                                  CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.T),
+                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_x),
+                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_y),
+                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.ior),
+                                  CLOSURE_INT_PARAM(MicrofacetClosure, refract),
+                                  CLOSURE_STRING_KEYPARAM(MicrofacetClosure, label, "label"),
+                                  CLOSURE_FINISH_PARAM(MicrofacetClosure)};
+
+  return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_prepare, MicrofacetClosure)
+
 /* GGX closures with Fresnel */
 
 class MicrofacetFresnelClosure : public CBSDFClosure {
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index d12afdb80dd..e4058e3a746 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -51,6 +51,7 @@ OSL::ClosureParam *closure_bsdf_transparent_params();
 OSL::ClosureParam *closure_bssrdf_params();
 OSL::ClosureParam *closure_absorption_params();
 OSL::ClosureParam *closure_henyey_greenstein_params();
+OSL::ClosureParam *closure_bsdf_microfacet_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
@@ -70,6 +71,7 @@ void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *dat
 void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data);
 void closure_absorption_prepare(OSL::RendererServices *, int id, void *data);
 void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 4def237a2e0..08bc8f85120 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -44,13 +44,13 @@ float sky_perez_function(float lam[9], float theta, float gamma)
          (1.0 + lam[2] * exp(lam[3] * gamma) + lam[4] * cgamma * cgamma);
 }
 
-color sky_radiance_old(normal dir,
-                       float sunphi,
-                       float suntheta,
-                       color radiance,
-                       float config_x[9],
-                       float config_y[9],
-                       float config_z[9])
+color sky_radiance_preetham(normal dir,
+                            float sunphi,
+                            float suntheta,
+                            color radiance,
+                            float config_x[9],
+                            float config_y[9],
+                            float config_z[9])
 {
   /* convert vector to spherical coordinates */
   vector spherical = sky_spherical_coordinates(dir);
@@ -88,13 +88,13 @@ float sky_radiance_internal(float config[9], float theta, float gamma)
          (config[2] + config[3] * expM + config[5] * rayM + config[6] * mieM + config[7] * zenith);
 }
 
-color sky_radiance_new(normal dir,
-                       float sunphi,
-                       float suntheta,
-                       color radiance,
-                       float config_x[9],
-                       float config_y[9],
-                       float config_z[9])
+color sky_radiance_hosek(normal dir,
+                         float sunphi,
+                         float suntheta,
+                         color radiance,
+                         float config_x[9],
+                         float config_y[9],
+                         float config_z[9])
 {
   /* convert vector to spherical coordinates */
   vector spherical = sky_spherical_coordinates(dir);
@@ -116,16 +116,103 @@ color sky_radiance_new(normal dir,
   return xyz_to_rgb(x, y, z) * (M_2PI / 683);
 }
 
+/* Nishita improved */
+vector geographical_to_direction(float lat, float lon)
+{
+  return vector(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
+}
+
+color sky_radiance_nishita(vector dir, float nishita_data[9], string filename)
+{
+  /* definitions */
+  float sun_elevation = nishita_data[6];
+  float sun_rotation = nishita_data[7];
+  float angular_diameter = nishita_data[8];
+  int sun_disc = angular_diameter > 0;
+  float alpha = 1.0;
+  color xyz = color(0, 0, 0);
+  /* convert dir to spherical coordinates */
+  vector direction = sky_spherical_coordinates(dir);
+
+  /* render above the horizon */
+  if (dir[2] >= 0.0) {
+    /* definitions */
+    vector sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2);
+    float sun_dir_angle = acos(dot(dir, sun_dir));
+    float half_angular = angular_diameter / 2.0;
+    float dir_elevation = M_PI_2 - direction[0];
+
+    /* if ray inside sun disc render it, otherwise render sky */
+    if (sun_dir_angle < half_angular && sun_disc == 1) {
+      /* get 2 pixels data (bottom and top of the sun disc) */
+      color pixel_bottom = color(nishita_data[0], nishita_data[1], nishita_data[2]);
+      color pixel_top = color(nishita_data[3], nishita_data[4], nishita_data[5]);
+      float y;
+
+      /* sun interpolation */
+      if (sun_elevation - half_angular > 0.0) {
+        if ((sun_elevation + half_angular) > 0.0) {
+          y = ((dir_elevation - sun_elevation) / angular_diameter) + 0.5;
+          xyz = mix(pixel_bottom, pixel_top, y);
+        }
+      }
+      else {
+        if (sun_elevation + half_angular > 0.0) {
+          y = dir_elevation / (sun_elevation + half_angular);
+          xyz = mix(pixel_bottom, pixel_top, y);
+        }
+      }
+      /* limb darkening, coefficient is 0.6f */
+      float angle_fraction = sun_dir_angle / half_angular;
+      float limb_darkening = (1.0 - 0.6 * (1.0 - sqrt(1.0 - angle_fraction * angle_fraction)));
+      xyz *= limb_darkening;
+    }
+    /* sky */
+    else {
+      /* sky interpolation */
+      float x = (direction[1] + M_PI + sun_rotation) / M_2PI;
+      float y = 1.0 - (dir_elevation / M_PI_2);
+      if (x > 1.0) {
+        x = x - 1.0;
+      }
+      xyz = (color)texture(filename, x, y, "wrap", "clamp", "interp", "linear", "alpha", alpha);
+    }
+  }
+  /* ground */
+  else {
+    if (dir[2] < -0.4) {
+      xyz = color(0, 0, 0);
+    }
+    else {
+      /* black ground fade */
+      float mul = pow(1.0 + dir[2] * 2.5, 3.0);
+      /* interpolation */
+      float x = (direction[1] + M_PI + sun_rotation) / M_2PI;
+      float y = 1.5;
+      if (x > 1.0) {
+        x = x - 1.0;
+      }
+      xyz = (color)texture(
+                filename, x, y, "wrap", "periodic", "interp", "linear", "alpha", alpha) *
+            mul;
+    }
+  }
+  /* convert to RGB and adjust strength */
+  return xyz_to_rgb(xyz[0], xyz[1], xyz[2]) * 120000.0;
+}
+
 shader node_sky_texture(int use_mapping = 0,
                         matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                         vector Vector = P,
                         string type = "hosek_wilkie",
                         float theta = 0.0,
                         float phi = 0.0,
+                        string filename = "",
                         color radiance = color(0.0, 0.0, 0.0),
                         float config_x[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
                         float config_y[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
                         float config_z[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+                        float nishita_data[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
                         output color Color = color(0.0, 0.0, 0.0))
 {
   vector p = Vector;
@@ -133,8 +220,10 @@ shader node_sky_texture(int use_mapping = 0,
   if (use_mapping)
     p = transform(mapping, p);
 
+  if (type == "nishita_improved")
+    Color = sky_radiance_nishita(p, nishita_data, filename);
   if (type == "hosek_wilkie")
-    Color = sky_radiance_new(p, phi, theta, radiance, config_x, config_y, config_z);
-  else
-    Color = sky_radiance_old(p, phi, theta, radiance, config_x, config_y, config_z);
+    Color = sky_radiance_hosek(p, phi, theta, radiance, config_x, config_y, config_z);
+  if (type == "preetham")
+    Color = sky_radiance_preetham(p, phi, theta, radiance, config_x, config_y, config_z);
 }
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 2c57a142692..1ae94f1d766 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -847,39 +847,29 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
     case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
       float3 weight = sd->svm_closure_weight * mix_weight;
 
-      if (sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
-        /* todo: giving a fixed weight here will cause issues when
-         * mixing multiple BSDFS. energy will not be conserved and
-         * the throughput can blow up after multiple bounces. we
-         * better figure out a way to skip backfaces from rays
-         * spawned by transmission from the front */
-        bsdf_transparent_setup(sd, make_float3(1.0f, 1.0f, 1.0f), path_flag);
-      }
-      else {
-        HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight);
+      HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight);
 
-        if (bsdf) {
-          bsdf->N = N;
-          bsdf->roughness1 = param1;
-          bsdf->roughness2 = param2;
-          bsdf->offset = -stack_load_float(stack, data_node.z);
+      if (bsdf) {
+        bsdf->N = N;
+        bsdf->roughness1 = param1;
+        bsdf->roughness2 = param2;
+        bsdf->offset = -stack_load_float(stack, data_node.z);
 
-          if (stack_valid(data_node.y)) {
-            bsdf->T = normalize(stack_load_float3(stack, data_node.y));
-          }
-          else if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
-            bsdf->T = normalize(sd->dPdv);
-            bsdf->offset = 0.0f;
-          }
-          else
-            bsdf->T = normalize(sd->dPdu);
+        if (stack_valid(data_node.y)) {
+          bsdf->T = normalize(stack_load_float3(stack, data_node.y));
+        }
+        else if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
+          bsdf->T = normalize(sd->dPdv);
+          bsdf->offset = 0.0f;
+        }
+        else
+          bsdf->T = normalize(sd->dPdu);
 
-          if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-            sd->flag |= bsdf_hair_reflection_setup(bsdf);
-          }
-          else {
-            sd->flag |= bsdf_hair_transmission_setup(bsdf);
-          }
+        if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
+          sd->flag |= bsdf_hair_reflection_setup(bsdf);
+        }
+        else {
+          sd->flag |= bsdf_hair_transmission_setup(bsdf);
         }
       }
 
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 019c6294082..77df19b2298 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -41,11 +41,9 @@ ccl_device_inline void svm_node_geometry(
     case NODE_GEOM_Ng:
       data = sd->Ng;
       break;
-#ifdef __UV__
     case NODE_GEOM_uv:
       data = make_float3(sd->u, sd->v, 0.0f);
       break;
-#endif
     default:
       data = make_float3(0.0f, 0.0f, 0.0f);
   }
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 914ef2089a9..7db8ffcc6e1 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -573,8 +573,8 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
  *
  *    Point  Offset from v0
  *     v0      (0, 0, 0)
- *     v1      (0, 0, 1)    The full avx type is computed by inserting the following
- *     v2      (0, 1, 0)    sse types into both the low and high parts of the avx.
+ *     v1      (0, 0, 1)    The full AVX type is computed by inserting the following
+ *     v2      (0, 1, 0)    SSE types into both the low and high parts of the AVX.
  *     v3      (0, 1, 1)
  *     v4      (1, 0, 0)
  *     v5      (1, 0, 1)    (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 50fe0c8232f..e877bd9a5c8 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,16 +37,16 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
          (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
 }
 
-ccl_device float3 sky_radiance_old(KernelGlobals *kg,
-                                   float3 dir,
-                                   float sunphi,
-                                   float suntheta,
-                                   float radiance_x,
-                                   float radiance_y,
-                                   float radiance_z,
-                                   float *config_x,
-                                   float *config_y,
-                                   float *config_z)
+ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+                                        float3 dir,
+                                        float sunphi,
+                                        float suntheta,
+                                        float radiance_x,
+                                        float radiance_y,
+                                        float radiance_z,
+                                        float *config_x,
+                                        float *config_y,
+                                        float *config_z)
 {
   /* convert vector to spherical coordinates */
   float2 spherical = direction_to_spherical(dir);
@@ -90,16 +90,16 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
           configuration[6] * mieM + configuration[7] * zenith);
 }
 
-ccl_device float3 sky_radiance_new(KernelGlobals *kg,
-                                   float3 dir,
-                                   float sunphi,
-                                   float suntheta,
-                                   float radiance_x,
-                                   float radiance_y,
-                                   float radiance_z,
-                                   float *config_x,
-                                   float *config_y,
-                                   float *config_z)
+ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+                                     float3 dir,
+                                     float sunphi,
+                                     float suntheta,
+                                     float radiance_x,
+                                     float radiance_y,
+                                     float radiance_z,
+                                     float *config_x,
+                                     float *config_y,
+                                     float *config_z)
 {
   /* convert vector to spherical coordinates */
   float2 spherical = direction_to_spherical(dir);
@@ -121,93 +121,214 @@ ccl_device float3 sky_radiance_new(KernelGlobals *kg,
   return xyz_to_rgb(kg, make_float3(x, y, z)) * (M_2PI_F / 683);
 }
 
+/* Nishita improved sky model */
+
+/* Convert latitude and longitude (radians) to a unit direction; latitude is measured from the XY plane. */
+ccl_device float3 geographical_to_direction(float lat, float lon)
+{
+  return make_float3(cosf(lat) * cosf(lon), cosf(lat) * sinf(lon), sinf(lat));
+}
+
+/* Evaluate the Nishita sky in direction `dir` and return it as RGB.
+ *
+ * nishita_data layout, as read below:
+ *   [0..2] XYZ of the bottom sun pixel, [3..5] XYZ of the top sun pixel,
+ *   [6] sun elevation, [7] sun rotation, [8] sun angular diameter (<= 0 disables the disc).
+ * texture_id is the image sampled for the sky and for the fading ground. */
+ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+                                       float3 dir,
+                                       float *nishita_data,
+                                       uint texture_id)
+{
+  /* definitions */
+  float sun_elevation = nishita_data[6];
+  float sun_rotation = nishita_data[7];
+  float angular_diameter = nishita_data[8];
+  bool sun_disc = (angular_diameter > 0.0f);
+  /* Start from black so that no path below can return an uninitialized color. */
+  float3 xyz = make_float3(0.0f, 0.0f, 0.0f);
+  /* convert dir to spherical coordinates */
+  float2 direction = direction_to_spherical(dir);
+
+  /* render above the horizon */
+  if (dir.z >= 0.0f) {
+    /* definitions */
+    float3 sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2_F);
+    float sun_dir_angle = acosf(dot(dir, sun_dir));
+    float half_angular = angular_diameter / 2.0f;
+    float dir_elevation = M_PI_2_F - direction.x;
+
+    /* if ray inside sun disc render it, otherwise render sky */
+    if (sun_disc && sun_dir_angle < half_angular) {
+      /* get the two sun pixels */
+      float3 pixel_bottom = make_float3(nishita_data[0], nishita_data[1], nishita_data[2]);
+      float3 pixel_top = make_float3(nishita_data[3], nishita_data[4], nishita_data[5]);
+
+      /* sun interpolation, only when part of the disc is above the horizon */
+      if (sun_elevation + half_angular > 0.0f) {
+        float y;
+        if (sun_elevation - half_angular > 0.0f) {
+          /* disc entirely above the horizon: interpolate across its full diameter */
+          y = ((dir_elevation - sun_elevation) / angular_diameter) + 0.5f;
+        }
+        else {
+          /* disc cut by the horizon: interpolate across the visible part only */
+          y = dir_elevation / (sun_elevation + half_angular);
+        }
+        xyz = interp(pixel_bottom, pixel_top, y);
+      }
+      /* limb darkening, coefficient is 0.6f */
+      float limb_darkening = (1.0f -
+                              0.6f * (1.0f - sqrtf(1.0f - sqr(sun_dir_angle / half_angular))));
+      xyz *= limb_darkening;
+    }
+    /* sky */
+    else {
+      /* sky interpolation */
+      float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F;
+      float y = dir_elevation / M_PI_2_F;
+      if (x > 1.0f) {
+        x -= 1.0f;
+      }
+      xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, y));
+    }
+  }
+  /* ground */
+  else {
+    if (dir.z < -0.4f) {
+      xyz = make_float3(0.0f, 0.0f, 0.0f);
+    }
+    else {
+      /* black ground fade */
+      float fade = 1.0f + dir.z * 2.5f;
+      fade = sqr(fade) * fade;
+      /* interpolation */
+      float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F;
+      if (x > 1.0f) {
+        x -= 1.0f;
+      }
+      xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, -0.5f)) * fade;
+    }
+  }
+
+  /* convert to rgb and adjust strength */
+  return xyz_to_rgb(kg, xyz) * 120000.0f;
+}
+
 ccl_device void svm_node_tex_sky(
     KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
-  /* Define variables */
-  float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
-  float config_x[9], config_y[9], config_z[9];
-
   /* Load data */
   uint dir_offset = node.y;
   uint out_offset = node.z;
   int sky_model = node.w;
 
-  float4 data = read_node_float(kg, offset);
-  sunphi = data.x;
-  suntheta = data.y;
-  radiance_x = data.z;
-  radiance_y = data.w;
-
-  data = read_node_float(kg, offset);
-  radiance_z = data.x;
-  config_x[0] = data.y;
-  config_x[1] = data.z;
-  config_x[2] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_x[3] = data.x;
-  config_x[4] = data.y;
-  config_x[5] = data.z;
-  config_x[6] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_x[7] = data.x;
-  config_x[8] = data.y;
-  config_y[0] = data.z;
-  config_y[1] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_y[2] = data.x;
-  config_y[3] = data.y;
-  config_y[4] = data.z;
-  config_y[5] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_y[6] = data.x;
-  config_y[7] = data.y;
-  config_y[8] = data.z;
-  config_z[0] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_z[1] = data.x;
-  config_z[2] = data.y;
-  config_z[3] = data.z;
-  config_z[4] = data.w;
-
-  data = read_node_float(kg, offset);
-  config_z[5] = data.x;
-  config_z[6] = data.y;
-  config_z[7] = data.z;
-  config_z[8] = data.w;
-
   float3 dir = stack_load_float3(stack, dir_offset);
   float3 f;
 
-  /* Compute Sky */
-  if (sky_model == 0) {
-    f = sky_radiance_old(kg,
-                         dir,
-                         sunphi,
-                         suntheta,
-                         radiance_x,
-                         radiance_y,
-                         radiance_z,
-                         config_x,
-                         config_y,
-                         config_z);
+  /* Preetham and Hosek share the same data */
+  if (sky_model == 0 || sky_model == 1) {
+    /* Define variables */
+    float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
+    float config_x[9], config_y[9], config_z[9];
+
+    float4 data = read_node_float(kg, offset);
+    sunphi = data.x;
+    suntheta = data.y;
+    radiance_x = data.z;
+    radiance_y = data.w;
+
+    data = read_node_float(kg, offset);
+    radiance_z = data.x;
+    config_x[0] = data.y;
+    config_x[1] = data.z;
+    config_x[2] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_x[3] = data.x;
+    config_x[4] = data.y;
+    config_x[5] = data.z;
+    config_x[6] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_x[7] = data.x;
+    config_x[8] = data.y;
+    config_y[0] = data.z;
+    config_y[1] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_y[2] = data.x;
+    config_y[3] = data.y;
+    config_y[4] = data.z;
+    config_y[5] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_y[6] = data.x;
+    config_y[7] = data.y;
+    config_y[8] = data.z;
+    config_z[0] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_z[1] = data.x;
+    config_z[2] = data.y;
+    config_z[3] = data.z;
+    config_z[4] = data.w;
+
+    data = read_node_float(kg, offset);
+    config_z[5] = data.x;
+    config_z[6] = data.y;
+    config_z[7] = data.z;
+    config_z[8] = data.w;
+
+    /* Compute Sky */
+    if (sky_model == 0) {
+      f = sky_radiance_preetham(kg,
+                                dir,
+                                sunphi,
+                                suntheta,
+                                radiance_x,
+                                radiance_y,
+                                radiance_z,
+                                config_x,
+                                config_y,
+                                config_z);
+    }
+    else {
+      f = sky_radiance_hosek(kg,
+                             dir,
+                             sunphi,
+                             suntheta,
+                             radiance_x,
+                             radiance_y,
+                             radiance_z,
+                             config_x,
+                             config_y,
+                             config_z);
+    }
   }
+  /* Nishita */
   else {
-    f = sky_radiance_new(kg,
-                         dir,
-                         sunphi,
-                         suntheta,
-                         radiance_x,
-                         radiance_y,
-                         radiance_z,
-                         config_x,
-                         config_y,
-                         config_z);
+    /* Define variables */
+    float nishita_data[9];
+
+    float4 data = read_node_float(kg, offset);
+    nishita_data[0] = data.x;
+    nishita_data[1] = data.y;
+    nishita_data[2] = data.z;
+    nishita_data[3] = data.w;
+
+    data = read_node_float(kg, offset);
+    nishita_data[4] = data.x;
+    nishita_data[5] = data.y;
+    nishita_data[6] = data.z;
+    nishita_data[7] = data.w;
+
+    data = read_node_float(kg, offset);
+    nishita_data[8] = data.x;
+    uint texture_id = __float_as_uint(data.y);
+
+    /* Compute Sky */
+    f = sky_radiance_nishita(kg, dir, nishita_data, texture_id);
   }
 
   stack_store_float3(stack, out_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index e913d9e0489..f1ebb37e23e 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -414,7 +414,7 @@ typedef enum NodeWaveProfile {
   NODE_WAVE_PROFILE_TRI,
 } NodeWaveProfile;
 
-typedef enum NodeSkyType { NODE_SKY_OLD, NODE_SKY_NEW } NodeSkyType;
+typedef enum NodeSkyType { NODE_SKY_PREETHAM, NODE_SKY_HOSEK, NODE_SKY_NISHITA } NodeSkyType;
 
 typedef enum NodeGradientType {
   NODE_BLEND_LINEAR,
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 472b5a0c101..e37a0407976 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -24,6 +24,7 @@ set(SRC
   hair.cpp
   image.cpp
   image_oiio.cpp
+  image_sky.cpp
   image_vdb.cpp
   integrator.cpp
   jitter.cpp
@@ -64,6 +65,7 @@ set(SRC_HEADERS
   hair.h
   image.h
   image_oiio.h
+  image_sky.h
   image_vdb.h
   integrator.h
   light.h
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 1907bb33d06..db48d8b6430 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -36,13 +36,12 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
   float *p2 = &p[2].x;
   float *p3 = &p[3].x;
 
-  float fc = 0.71f;
+  /* Catmull-Rom weights. */
   float curve_coef[4];
   curve_coef[0] = p1[dim];
-  curve_coef[1] = -fc * p0[dim] + fc * p2[dim];
-  curve_coef[2] = 2.0f * fc * p0[dim] + (fc - 3.0f) * p1[dim] + (3.0f - 2.0f * fc) * p2[dim] -
-                  fc * p3[dim];
-  curve_coef[3] = -fc * p0[dim] + (2.0f - fc) * p1[dim] + (fc - 2.0f) * p2[dim] + fc * p3[dim];
+  curve_coef[1] = 0.5f * (-p0[dim] + p2[dim]);
+  curve_coef[2] = 0.5f * (2 * p0[dim] - 5 * p1[dim] + 4 * p2[dim] - p3[dim]);
+  curve_coef[3] = 0.5f * (-p0[dim] + 3 * p1[dim] - 3 * p2[dim] + p3[dim]);
 
   float discroot = curve_coef[2] * curve_coef[2] - 3 * curve_coef[3] * curve_coef[1];
   float ta = -1.0f;
@@ -77,105 +76,4 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
   *lower = min(*lower, min(exa, exb));
 }
 
-/* Hair System Manager */
-
-CurveSystemManager::CurveSystemManager()
-{
-  primitive = CURVE_LINE_SEGMENTS;
-  curve_shape = CURVE_THICK;
-  line_method = CURVE_CORRECTED;
-  triangle_method = CURVE_CAMERA_TRIANGLES;
-  resolution = 3;
-  subdivisions = 3;
-
-  use_curves = true;
-  use_encasing = true;
-  use_backfacing = false;
-  use_tangent_normal_geometry = false;
-
-  need_update = true;
-  need_mesh_update = false;
-}
-
-CurveSystemManager::~CurveSystemManager()
-{
-}
-
-void CurveSystemManager::device_update(Device *device,
-                                       DeviceScene *dscene,
-                                       Scene * /*scene*/,
-                                       Progress &progress)
-{
-  if (!need_update)
-    return;
-
-  device_free(device, dscene);
-
-  progress.set_status("Updating Hair settings", "Copying Hair settings to device");
-
-  KernelCurves *kcurve = &dscene->data.curve;
-
-  kcurve->curveflags = 0;
-
-  if (use_curves) {
-    if (primitive == CURVE_SEGMENTS || primitive == CURVE_RIBBONS)
-      kcurve->curveflags |= CURVE_KN_INTERPOLATE;
-    if (primitive == CURVE_RIBBONS)
-      kcurve->curveflags |= CURVE_KN_RIBBONS;
-
-    if (line_method == CURVE_ACCURATE)
-      kcurve->curveflags |= CURVE_KN_ACCURATE;
-    else if (line_method == CURVE_CORRECTED)
-      kcurve->curveflags |= CURVE_KN_INTERSECTCORRECTION;
-
-    if (use_tangent_normal_geometry)
-      kcurve->curveflags |= CURVE_KN_TRUETANGENTGNORMAL;
-    if (use_backfacing)
-      kcurve->curveflags |= CURVE_KN_BACKFACING;
-    if (use_encasing)
-      kcurve->curveflags |= CURVE_KN_ENCLOSEFILTER;
-
-    kcurve->subdivisions = subdivisions;
-  }
-
-  if (progress.get_cancel())
-    return;
-
-  need_update = false;
-}
-
-void CurveSystemManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
-{
-}
-
-bool CurveSystemManager::modified(const CurveSystemManager &CurveSystemManager)
-{
-  return !(
-      curve_shape == CurveSystemManager.curve_shape &&
-      line_method == CurveSystemManager.line_method && primitive == CurveSystemManager.primitive &&
-      use_encasing == CurveSystemManager.use_encasing &&
-      use_tangent_normal_geometry == CurveSystemManager.use_tangent_normal_geometry &&
-      use_backfacing == CurveSystemManager.use_backfacing &&
-      triangle_method == CurveSystemManager.triangle_method &&
-      resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves &&
-      subdivisions == CurveSystemManager.subdivisions);
-}
-
-bool CurveSystemManager::modified_mesh(const CurveSystemManager &CurveSystemManager)
-{
-  return !(
-      primitive == CurveSystemManager.primitive && curve_shape == CurveSystemManager.curve_shape &&
-      triangle_method == CurveSystemManager.triangle_method &&
-      resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves);
-}
-
-void CurveSystemManager::tag_update(Scene * /*scene*/)
-{
-  need_update = true;
-}
-
-void CurveSystemManager::tag_update_mesh()
-{
-  need_mesh_update = true;
-}
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index ade289a402e..c52fcb9c882 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -20,6 +20,8 @@
 #include "util/util_array.h"
 #include "util/util_types.h"
 
+#include "render/hair.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Device;
@@ -29,33 +31,6 @@ class Scene;
 
 void curvebounds(float *lower, float *upper, float3 *p, int dim);
 
-typedef enum CurvePrimitiveType {
-  CURVE_TRIANGLES = 0,
-  CURVE_LINE_SEGMENTS = 1,
-  CURVE_SEGMENTS = 2,
-  CURVE_RIBBONS = 3,
-
-  CURVE_NUM_PRIMITIVE_TYPES,
-} CurvePrimitiveType;
-
-typedef enum CurveShapeType {
-  CURVE_RIBBON = 0,
-  CURVE_THICK = 1,
-
-  CURVE_NUM_SHAPE_TYPES,
-} CurveShapeType;
-
-typedef enum CurveTriangleMethod {
-  CURVE_CAMERA_TRIANGLES,
-  CURVE_TESSELATED_TRIANGLES
-} CurveTriangleMethod;
-
-typedef enum CurveLineMethod {
-  CURVE_ACCURATE,
-  CURVE_CORRECTED,
-  CURVE_UNCORRECTED
-} CurveLineMethod;
-
 class ParticleCurveData {
 
  public:
@@ -75,43 +50,12 @@ class ParticleCurveData {
   array<int> curve_keynum;
   array<float> curve_length;
   array<float2> curve_uv;
-  array<float3> curve_vcol;
+  array<float4> curve_vcol;
 
   array<float3> curvekey_co;
   array<float> curvekey_time;
 };
 
-/* HairSystem Manager */
-
-class CurveSystemManager {
- public:
-  CurvePrimitiveType primitive;
-  CurveShapeType curve_shape;
-  CurveLineMethod line_method;
-  CurveTriangleMethod triangle_method;
-  int resolution;
-  int subdivisions;
-
-  bool use_curves;
-  bool use_encasing;
-  bool use_backfacing;
-  bool use_tangent_normal_geometry;
-
-  bool need_update;
-  bool need_mesh_update;
-
-  CurveSystemManager();
-  ~CurveSystemManager();
-
-  void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-  void device_free(Device *device, DeviceScene *dscene);
-  bool modified(const CurveSystemManager &CurveSystemManager);
-  bool modified_mesh(const CurveSystemManager &CurveSystemManager);
-
-  void tag_update(Scene *scene);
-  void tag_update_mesh();
-};
-
 CCL_NAMESPACE_END
 
 #endif /* __CURVES_H__ */
diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp
index 4d819d1119e..4055bc4773b 100644
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -21,6 +21,7 @@
 #include "util/util_foreach.h"
 #include "util/util_map.h"
 #include "util/util_system.h"
+#include "util/util_task.h"
 #include "util/util_time.h"
 
 #include <OpenImageIO/filesystem.h>
@@ -377,8 +378,9 @@ void DenoiseTask::create_task(DeviceTask &task)
 
   /* Denoising parameters. */
   task.denoising = denoiser->params;
-  task.denoising_do_filter = true;
-  task.denoising_write_passes = false;
+  task.denoising.type = DENOISER_NLM;
+  task.denoising.use = true;
+  task.denoising.store_passes = false;
   task.denoising_from_render = false;
 
   task.denoising_frames.resize(neighbor_frames.size());
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
index d46ed430c4f..3d1b6e1d865 100644
--- a/intern/cycles/render/geometry.cpp
+++ b/intern/cycles/render/geometry.cpp
@@ -16,10 +16,9 @@
 
 #include "bvh/bvh.h"
 #include "bvh/bvh_build.h"
+#include "bvh/bvh_embree.h"
 
-#ifdef WITH_EMBREE
-#  include "bvh/bvh_embree.h"
-#endif
+#include "device/device.h"
 
 #include "render/attribute.h"
 #include "render/camera.h"
@@ -212,8 +211,7 @@ void Geometry::compute_bvh(
       bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
       bparams.num_motion_curve_steps = params->num_bvh_time_steps;
       bparams.bvh_type = params->bvh_type;
-      bparams.curve_flags = dscene->data.curve.curveflags;
-      bparams.curve_subdivisions = dscene->data.curve.subdivisions;
+      bparams.curve_subdivisions = params->curve_subdivisions();
 
       delete bvh;
       bvh = BVH::create(bparams, geometry, objects);
@@ -1027,28 +1025,18 @@ void GeometryManager::device_update_bvh(Device *device,
   bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
   bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
   bparams.bvh_type = scene->params.bvh_type;
-  bparams.curve_flags = dscene->data.curve.curveflags;
-  bparams.curve_subdivisions = dscene->data.curve.subdivisions;
+  bparams.curve_subdivisions = scene->params.curve_subdivisions();
 
   VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
 
-#ifdef WITH_EMBREE
-  if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
-    if (dscene->data.bvh.scene) {
-      BVHEmbree::destroy(dscene->data.bvh.scene);
-    }
-  }
-#endif
-
   BVH *bvh = BVH::create(bparams, scene->geometry, scene->objects);
   bvh->build(progress, &device->stats);
 
   if (progress.get_cancel()) {
 #ifdef WITH_EMBREE
-    if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
-      if (dscene->data.bvh.scene) {
-        BVHEmbree::destroy(dscene->data.bvh.scene);
-      }
+    if (dscene->data.bvh.scene) {
+      BVHEmbree::destroy(dscene->data.bvh.scene);
+      dscene->data.bvh.scene = NULL;
     }
 #endif
     delete bvh;
@@ -1104,6 +1092,7 @@ void GeometryManager::device_update_bvh(Device *device,
   dscene->data.bvh.root = pack.root_index;
   dscene->data.bvh.bvh_layout = bparams.bvh_layout;
   dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
+  dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions();
 
   bvh->copy_to_device(progress, dscene);
 
@@ -1146,6 +1135,12 @@ void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Pro
         create_volume_mesh(mesh, progress);
       }
     }
+
+    if (geom->type == Geometry::HAIR) {
+      /* Set curve shape, still a global scene setting for now. */
+      Hair *hair = static_cast<Hair *>(geom);
+      hair->curve_shape = scene->params.hair_shape;
+    }
   }
 
   need_flags_update = false;
@@ -1413,6 +1408,14 @@ void GeometryManager::device_update(Device *device,
 
 void GeometryManager::device_free(Device *device, DeviceScene *dscene)
 {
+#ifdef WITH_EMBREE
+  if (dscene->data.bvh.scene) {
+    if (dscene->data.bvh.bvh_layout == BVH_LAYOUT_EMBREE)
+      BVHEmbree::destroy(dscene->data.bvh.scene);
+    dscene->data.bvh.scene = NULL;
+  }
+#endif
+
   dscene->bvh_nodes.free();
   dscene->bvh_leaf_nodes.free();
   dscene->object_node.free();
diff --git a/intern/cycles/render/hair.cpp b/intern/cycles/render/hair.cpp
index 3daa4cc1e35..816c15cf4ef 100644
--- a/intern/cycles/render/hair.cpp
+++ b/intern/cycles/render/hair.cpp
@@ -294,6 +294,7 @@ NODE_DEFINE(Hair)
 Hair::Hair() : Geometry(node_type, Geometry::HAIR)
 {
   curvekey_offset = 0;
+  curve_shape = CURVE_RIBBON;
 }
 
 Hair::~Hair()
diff --git a/intern/cycles/render/hair.h b/intern/cycles/render/hair.h
index 79f77a78753..39d6a34d799 100644
--- a/intern/cycles/render/hair.h
+++ b/intern/cycles/render/hair.h
@@ -96,6 +96,7 @@ class Hair : public Geometry {
 
   /* BVH */
   size_t curvekey_offset;
+  CurveShapeType curve_shape;
 
   /* Constructor/Destructor */
   Hair();
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 75050b66bf2..8d187814d64 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -27,6 +27,7 @@
 #include "util/util_logging.h"
 #include "util/util_path.h"
 #include "util/util_progress.h"
+#include "util/util_task.h"
 #include "util/util_texture.h"
 #include "util/util_unique_ptr.h"
 
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 2000582ce70..fffe7c5152a 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,7 +17,6 @@
 #ifndef __IMAGE_H__
 #define __IMAGE_H__
 
-#include "device/device.h"
 #include "device/device_memory.h"
 
 #include "render/colorspace.h"
@@ -31,6 +30,7 @@
 CCL_NAMESPACE_BEGIN
 
 class Device;
+class DeviceInfo;
 class ImageHandle;
 class ImageKey;
 class ImageMetaData;
diff --git a/intern/cycles/render/image_sky.cpp b/intern/cycles/render/image_sky.cpp
new file mode 100644
index 00000000000..442e1d7941f
--- /dev/null
+++ b/intern/cycles/render/image_sky.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_sky.h"
+
+#include "util/util_image.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_sky_model.h"
+#include "util/util_task.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Capture the atmosphere parameters; nothing is computed until load_pixels(). */
+SkyLoader::SkyLoader(
+    float sun_elevation, int altitude, float air_density, float dust_density, float ozone_density)
+    : sun_elevation(sun_elevation),
+      altitude(altitude),
+      air_density(air_density),
+      dust_density(dust_density),
+      ozone_density(ozone_density)
+{
+}
+
+SkyLoader::~SkyLoader()
+{
+}
+
+/* Report the fixed layout of the generated texture: 512x128 pixels, 3 channels, stored as
+ * IMAGE_DATA_TYPE_FLOAT4 and not compressed as sRGB. */
+bool SkyLoader::load_metadata(ImageMetaData &metadata)
+{
+  metadata.width = 512;
+  metadata.height = 128;
+  metadata.channels = 3;
+  metadata.depth = 1;
+  metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+  metadata.compress_as_srgb = false;
+  return true;
+}
+
+/* Generate the sky texture into `pixels`, distributing rows over parallel tasks.
+ * The pixels_size and associate_alpha arguments are ignored. */
+bool SkyLoader::load_pixels(const ImageMetaData &metadata,
+                            void *pixels,
+                            const size_t /*pixels_size*/,
+                            const bool /*associate_alpha*/)
+{
+  /* definitions */
+  int width = metadata.width;
+  int height = metadata.height;
+  float *pixel_data = (float *)pixels;
+  float altitude_f = (float)altitude;
+
+  /* precompute sky texture */
+  const int rows_per_task = divide_up(1024, width);
+  parallel_for(blocked_range<size_t>(0, height, rows_per_task),
+               [&](const blocked_range<size_t> &r) {
+                 nishita_skymodel_precompute_texture(pixel_data,
+                                                     metadata.channels,
+                                                     r.begin(),
+                                                     r.end(),
+                                                     width,
+                                                     height,
+                                                     sun_elevation,
+                                                     altitude_f,
+                                                     air_density,
+                                                     dust_density,
+                                                     ozone_density);
+               });
+
+  return true;
+}
+
+/* Fixed identifier for this loader. */
+string SkyLoader::name() const
+{
+  return "sky_nishita";
+}
+
+/* Always false: this loader never compares equal to another one. */
+bool SkyLoader::equals(const ImageLoader & /*other*/) const
+{
+  return false;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_sky.h b/intern/cycles/render/image_sky.h
new file mode 100644
index 00000000000..cf4a3e8942c
--- /dev/null
+++ b/intern/cycles/render/image_sky.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IMAGE_SKY_H__
+#define __IMAGE_SKY_H__
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Image loader that generates the Nishita sky texture from atmosphere parameters
+ * instead of reading it from a file. */
+class SkyLoader : public ImageLoader {
+ private:
+  /* Parameters forwarded to the sky precomputation in load_pixels(). */
+  float sun_elevation;
+  int altitude;
+  float air_density;
+  float dust_density;
+  float ozone_density;
+
+ public:
+  SkyLoader(float sun_elevation,
+            int altitude,
+            float air_density,
+            float dust_density,
+            float ozone_density);
+  ~SkyLoader();
+
+  bool load_metadata(ImageMetaData &metadata) override;
+
+  bool load_pixels(const ImageMetaData &metadata,
+                   void *pixels,
+                   const size_t /*pixels_size*/,
+                   const bool /*associate_alpha*/) override;
+
+  string name() const override;
+
+  bool equals(const ImageLoader & /*other*/) const override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __IMAGE_SKY_H__ */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index d4beb06e57b..eff416efa2b 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -29,6 +29,7 @@
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
 #include "util/util_logging.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index cb7474017fa..c0615c6217b 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -31,6 +31,7 @@
 #include "util/util_logging.h"
 #include "util/util_path.h"
 #include "util/util_progress.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -450,6 +451,7 @@ void LightManager::device_update_distribution(Device *,
 
   /* update device */
   KernelIntegrator *kintegrator = &dscene->data.integrator;
+  KernelBackground *kbackground = &dscene->data.background;
   KernelFilm *kfilm = &dscene->data.film;
   kintegrator->use_direct_light = (totarea > 0.0f);
 
@@ -493,15 +495,18 @@ void LightManager::device_update_distribution(Device *,
 
     /* Portals */
     if (num_portals > 0) {
-      kintegrator->portal_offset = light_index;
-      kintegrator->num_portals = num_portals;
-      kintegrator->portal_pdf = background_mis ? 0.5f : 1.0f;
+      kbackground->portal_offset = light_index;
+      kbackground->num_portals = num_portals;
+      kbackground->portal_weight = 1.0f;
     }
     else {
-      kintegrator->num_portals = 0;
-      kintegrator->portal_offset = 0;
-      kintegrator->portal_pdf = 0.0f;
+      kbackground->num_portals = 0;
+      kbackground->portal_offset = 0;
+      kbackground->portal_weight = 0.0f;
     }
+
+    /* Map */
+    kbackground->map_weight = background_mis ? 1.0f : 0.0f;
   }
   else {
     dscene->light_distribution.free();
@@ -511,9 +516,12 @@ void LightManager::device_update_distribution(Device *,
     kintegrator->pdf_triangles = 0.0f;
     kintegrator->pdf_lights = 0.0f;
     kintegrator->use_lamp_mis = false;
-    kintegrator->num_portals = 0;
-    kintegrator->portal_offset = 0;
-    kintegrator->portal_pdf = 0.0f;
+
+    kbackground->num_portals = 0;
+    kbackground->portal_offset = 0;
+    kbackground->portal_weight = 0.0f;
+    kbackground->sun_weight = 0.0f;
+    kbackground->map_weight = 0.0f;
 
     kfilm->pass_shadow_scale = 1.0f;
   }
@@ -562,7 +570,7 @@ void LightManager::device_update_background(Device *device,
                                             Scene *scene,
                                             Progress &progress)
 {
-  KernelIntegrator *kintegrator = &dscene->data.integrator;
+  KernelBackground *kbackground = &dscene->data.background;
   Light *background_light = NULL;
 
   /* find background light */
@@ -575,31 +583,79 @@ void LightManager::device_update_background(Device *device,
 
   /* no background light found, signal renderer to skip sampling */
   if (!background_light || !background_light->is_enabled) {
-    kintegrator->pdf_background_res_x = 0;
-    kintegrator->pdf_background_res_y = 0;
+    kbackground->map_res_x = 0;
+    kbackground->map_res_y = 0;
+    kbackground->map_weight = 0.0f;
+    kbackground->sun_weight = 0.0f;
+    kbackground->use_mis = (kbackground->portal_weight > 0.0f);
     return;
   }
 
   progress.set_status("Updating Lights", "Importance map");
 
-  assert(kintegrator->use_direct_light);
+  assert(dscene->data.integrator.use_direct_light);
+
+  int2 environment_res = make_int2(0, 0);
+  Shader *shader = scene->background->get_shader(scene);
+  int num_suns = 0;
+  /* Gather the largest environment resolution and count directly sampleable sun discs. */
+  foreach (ShaderNode *node, shader->graph->nodes) {
+    if (node->type == EnvironmentTextureNode::node_type) {
+      EnvironmentTextureNode *env = (EnvironmentTextureNode *)node;
+      if (!env->handle.empty()) {
+        ImageMetaData metadata = env->handle.metadata();
+        environment_res.x = max(environment_res.x, metadata.width);
+        environment_res.y = max(environment_res.y, metadata.height);
+      }
+    }
+    if (node->type == SkyTextureNode::node_type) {
+      SkyTextureNode *sky = (SkyTextureNode *)node;
+      if (sky->type == NODE_SKY_NISHITA && sky->sun_disc) {
+        /* Ensure that the input coordinates aren't transformed before they reach the node.
+         * If that is the case, the logic used for sampling the sun's location does not work
+         * and we have to fall back to map-based sampling. */
+        const ShaderInput *vec_in = sky->input("Vector");
+        if (vec_in && vec_in->link && vec_in->link->parent) {
+          ShaderNode *vec_src = vec_in->link->parent;
+          if ((vec_src->type != TextureCoordinateNode::node_type) ||
+              (vec_in->link != vec_src->output("Generated"))) {
+            environment_res.x = max(environment_res.x, 4096);
+            environment_res.y = max(environment_res.y, 2048);
+            continue;
+          }
+        }
+
+        float latitude = sky->sun_elevation;
+        float longitude = M_2PI_F - sky->sun_rotation + M_PI_2_F;
+        float half_angle = sky->sun_size * 0.5f;
+        kbackground->sun = make_float4(cosf(latitude) * cosf(longitude),
+                                       cosf(latitude) * sinf(longitude),
+                                       sinf(latitude),
+                                       half_angle);
+        kbackground->sun_weight = 4.0f;
+        environment_res.x = max(environment_res.x, 512);
+        environment_res.y = max(environment_res.y, 256);
+        num_suns++;
+      }
+    }
+  }
+
+  /* Unless exactly one sun disc can be sampled directly, fall back to map sampling. */
+  if (num_suns != 1) {
+    kbackground->sun_weight = 0.0f;
+    environment_res.x = max(environment_res.x, 4096);
+    environment_res.y = max(environment_res.y, 2048);
+  }
+
+  /* Enable MIS for background sampling if any strategy is active. */
+  kbackground->use_mis = (kbackground->portal_weight + kbackground->map_weight +
+                          kbackground->sun_weight) > 0.0f;
 
   /* get the resolution from the light's size (we stuff it in there) */
   int2 res = make_int2(background_light->map_resolution, background_light->map_resolution / 2);
   /* If the resolution isn't set manually, try to find an environment texture. */
   if (res.x == 0) {
-    Shader *shader = scene->background->get_shader(scene);
-    foreach (ShaderNode *node, shader->graph->nodes) {
-      if (node->type == EnvironmentTextureNode::node_type) {
-        EnvironmentTextureNode *env = (EnvironmentTextureNode *)node;
-        ImageMetaData metadata;
-        if (!env->handle.empty()) {
-          ImageMetaData metadata = env->handle.metadata();
-          res.x = max(res.x, metadata.width);
-          res.y = max(res.y, metadata.height);
-        }
-      }
-    }
+    res = environment_res;
     if (res.x > 0 && res.y > 0) {
       VLOG(2) << "Automatically set World MIS resolution to " << res.x << " by " << res.y << "\n";
     }
@@ -609,8 +665,8 @@ void LightManager::device_update_background(Device *device,
     res = make_int2(1024, 512);
     VLOG(2) << "Setting World MIS resolution to default\n";
   }
-  kintegrator->pdf_background_res_x = res.x;
-  kintegrator->pdf_background_res_y = res.y;
+  kbackground->map_res_x = res.x;
+  kbackground->map_res_y = res.y;
 
   vector<float3> pixels;
   shade_background_pixels(device, dscene, res.x, res.y, pixels, progress);
@@ -624,29 +680,13 @@ void LightManager::device_update_background(Device *device,
   float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_width * res.y);
 
   double time_start = time_dt();
-  if (max(res.x, res.y) < 512) {
-    /* Small enough resolution, faster to do single-threaded. */
-    background_cdf(0, res.y, res.x, res.y, &pixels, cond_cdf);
-  }
-  else {
-    /* Threaded evaluation for large resolution. */
-    const int num_blocks = TaskScheduler::num_threads();
-    const int chunk_size = res.y / num_blocks;
-    int start_row = 0;
-    TaskPool pool;
-    for (int i = 0; i < num_blocks; ++i) {
-      const int current_chunk_size = (i != num_blocks - 1) ? chunk_size : (res.y - i * chunk_size);
-      pool.push(function_bind(&background_cdf,
-                              start_row,
-                              start_row + current_chunk_size,
-                              res.x,
-                              res.y,
-                              &pixels,
-                              cond_cdf));
-      start_row += current_chunk_size;
-    }
-    pool.wait_work();
-  }
+
+  /* Create CDF in parallel. */
+  const int rows_per_task = divide_up(10240, res.x);
+  parallel_for(blocked_range<size_t>(0, res.y, rows_per_task),
+               [&](const blocked_range<size_t> &r) {
+                 background_cdf(r.begin(), r.end(), res.x, res.y, &pixels, cond_cdf);
+               });
 
   /* marginal CDFs (column, V direction, sum of rows) */
   marg_cdf[0].x = cond_cdf[res.x].x;
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index cdcaeb246dd..ab392839e52 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -19,6 +19,7 @@
 #include "render/constant_fold.h"
 #include "render/film.h"
 #include "render/image.h"
+#include "render/image_sky.h"
 #include "render/integrator.h"
 #include "render/light.h"
 #include "render/mesh.h"
@@ -630,7 +631,7 @@ typedef struct SunSky {
 
   /* Parameter */
   float radiance_x, radiance_y, radiance_z;
-  float config_x[9], config_y[9], config_z[9];
+  float config_x[9], config_y[9], config_z[9], nishita_data[9];
 } SunSky;
 
 /* Preetham model */
@@ -640,7 +641,7 @@ static float sky_perez_function(float lam[6], float theta, float gamma)
          (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cosf(gamma) * cosf(gamma));
 }
 
-static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity)
+static void sky_texture_precompute_preetham(SunSky *sunsky, float3 dir, float turbidity)
 {
   /*
    * We re-use the SunSky struct of the new model, to avoid extra variables
@@ -703,10 +704,10 @@ static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidi
 }
 
 /* Hosek / Wilkie */
-static void sky_texture_precompute_new(SunSky *sunsky,
-                                       float3 dir,
-                                       float turbidity,
-                                       float ground_albedo)
+static void sky_texture_precompute_hosek(SunSky *sunsky,
+                                         float3 dir,
+                                         float turbidity,
+                                         float ground_albedo)
 {
   /* Calculate Sun Direction and save coordinates */
   float2 spherical = sky_spherical_coordinates(dir);
@@ -743,6 +744,34 @@ static void sky_texture_precompute_new(SunSky *sunsky,
   arhosekskymodelstate_free(sky_state);
 }
 
+/* Nishita improved: fill sunsky->nishita_data from the sun parameters. */
+static void sky_texture_precompute_nishita(SunSky *sunsky,
+                                           bool sun_disc,
+                                           float sun_size,
+                                           float sun_elevation,
+                                           float sun_rotation,
+                                           int altitude,
+                                           float air_density,
+                                           float dust_density)
+{
+  /* Sample two sun pixels, bottom and top; each output is a 3-component float array. */
+  float pixel_bottom[3];
+  float pixel_top[3];
+  float altitude_f = (float)altitude;
+  nishita_skymodel_precompute_sun(
+      sun_elevation, sun_size, altitude_f, air_density, dust_density, pixel_bottom, pixel_top);
+  /* Layout: [0-2] bottom, [3-5] top, [6] elevation, [7] M_2PI_F - rotation, [8] size or 0. */
+  sunsky->nishita_data[0] = pixel_bottom[0];
+  sunsky->nishita_data[1] = pixel_bottom[1];
+  sunsky->nishita_data[2] = pixel_bottom[2];
+  sunsky->nishita_data[3] = pixel_top[0];
+  sunsky->nishita_data[4] = pixel_top[1];
+  sunsky->nishita_data[5] = pixel_top[2];
+  sunsky->nishita_data[6] = sun_elevation;
+  sunsky->nishita_data[7] = M_2PI_F - sun_rotation;
+  sunsky->nishita_data[8] = sun_disc ? sun_size : 0.0f;
+}
+
 NODE_DEFINE(SkyTextureNode)
 {
   NodeType *type = NodeType::add("sky_texture", create, NodeType::SHADER);
@@ -750,13 +779,22 @@ NODE_DEFINE(SkyTextureNode)
   TEXTURE_MAPPING_DEFINE(SkyTextureNode);
 
   static NodeEnum type_enum;
-  type_enum.insert("preetham", NODE_SKY_OLD);
-  type_enum.insert("hosek_wilkie", NODE_SKY_NEW);
-  SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NEW);
+  type_enum.insert("preetham", NODE_SKY_PREETHAM);
+  type_enum.insert("hosek_wilkie", NODE_SKY_HOSEK);
+  type_enum.insert("nishita_improved", NODE_SKY_NISHITA);
+  SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NISHITA);
 
   SOCKET_VECTOR(sun_direction, "Sun Direction", make_float3(0.0f, 0.0f, 1.0f));
   SOCKET_FLOAT(turbidity, "Turbidity", 2.2f);
   SOCKET_FLOAT(ground_albedo, "Ground Albedo", 0.3f);
+  SOCKET_BOOLEAN(sun_disc, "Sun Disc", true);
+  SOCKET_FLOAT(sun_size, "Sun Size", 0.009512f);
+  SOCKET_FLOAT(sun_elevation, "Sun Elevation", M_PI_2_F);
+  SOCKET_FLOAT(sun_rotation, "Sun Rotation", 0.0f);
+  SOCKET_INT(altitude, "Altitude", 0);
+  SOCKET_FLOAT(air_density, "Air", 1.0f);
+  SOCKET_FLOAT(dust_density, "Dust", 1.0f);
+  SOCKET_FLOAT(ozone_density, "Ozone", 1.0f);
 
   SOCKET_IN_POINT(
       vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
@@ -776,10 +814,32 @@ void SkyTextureNode::compile(SVMCompiler &compiler)
   ShaderOutput *color_out = output("Color");
 
   SunSky sunsky;
-  if (type == NODE_SKY_OLD)
-    sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
-  else if (type == NODE_SKY_NEW)
-    sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+  if (type == NODE_SKY_PREETHAM)
+    sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity);
+  else if (type == NODE_SKY_HOSEK)
+    sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo);
+  else if (type == NODE_SKY_NISHITA) {
+    sky_texture_precompute_nishita(&sunsky,
+                                   sun_disc,
+                                   sun_size,
+                                   sun_elevation,
+                                   sun_rotation,
+                                   altitude,
+                                   air_density,
+                                   dust_density);
+    /* precomputed texture image parameters */
+    ImageManager *image_manager = compiler.scene->image_manager;
+    ImageParams impar;
+    impar.interpolation = INTERPOLATION_LINEAR;
+    impar.extension = EXTENSION_EXTEND;
+
+    /* precompute sky texture */
+    if (handle.empty()) {
+      SkyLoader *loader = new SkyLoader(
+          sun_elevation, altitude, air_density, dust_density, ozone_density);
+      handle = image_manager->add_image(loader, impar);
+    }
+  }
   else
     assert(false);
 
@@ -787,38 +847,52 @@ void SkyTextureNode::compile(SVMCompiler &compiler)
 
   compiler.stack_assign(color_out);
   compiler.add_node(NODE_TEX_SKY, vector_offset, compiler.stack_assign(color_out), type);
-  compiler.add_node(__float_as_uint(sunsky.phi),
-                    __float_as_uint(sunsky.theta),
-                    __float_as_uint(sunsky.radiance_x),
-                    __float_as_uint(sunsky.radiance_y));
-  compiler.add_node(__float_as_uint(sunsky.radiance_z),
-                    __float_as_uint(sunsky.config_x[0]),
-                    __float_as_uint(sunsky.config_x[1]),
-                    __float_as_uint(sunsky.config_x[2]));
-  compiler.add_node(__float_as_uint(sunsky.config_x[3]),
-                    __float_as_uint(sunsky.config_x[4]),
-                    __float_as_uint(sunsky.config_x[5]),
-                    __float_as_uint(sunsky.config_x[6]));
-  compiler.add_node(__float_as_uint(sunsky.config_x[7]),
-                    __float_as_uint(sunsky.config_x[8]),
-                    __float_as_uint(sunsky.config_y[0]),
-                    __float_as_uint(sunsky.config_y[1]));
-  compiler.add_node(__float_as_uint(sunsky.config_y[2]),
-                    __float_as_uint(sunsky.config_y[3]),
-                    __float_as_uint(sunsky.config_y[4]),
-                    __float_as_uint(sunsky.config_y[5]));
-  compiler.add_node(__float_as_uint(sunsky.config_y[6]),
-                    __float_as_uint(sunsky.config_y[7]),
-                    __float_as_uint(sunsky.config_y[8]),
-                    __float_as_uint(sunsky.config_z[0]));
-  compiler.add_node(__float_as_uint(sunsky.config_z[1]),
-                    __float_as_uint(sunsky.config_z[2]),
-                    __float_as_uint(sunsky.config_z[3]),
-                    __float_as_uint(sunsky.config_z[4]));
-  compiler.add_node(__float_as_uint(sunsky.config_z[5]),
-                    __float_as_uint(sunsky.config_z[6]),
-                    __float_as_uint(sunsky.config_z[7]),
-                    __float_as_uint(sunsky.config_z[8]));
+  /* nishita doesn't need this data */
+  if (type != NODE_SKY_NISHITA) {
+    compiler.add_node(__float_as_uint(sunsky.phi),
+                      __float_as_uint(sunsky.theta),
+                      __float_as_uint(sunsky.radiance_x),
+                      __float_as_uint(sunsky.radiance_y));
+    compiler.add_node(__float_as_uint(sunsky.radiance_z),
+                      __float_as_uint(sunsky.config_x[0]),
+                      __float_as_uint(sunsky.config_x[1]),
+                      __float_as_uint(sunsky.config_x[2]));
+    compiler.add_node(__float_as_uint(sunsky.config_x[3]),
+                      __float_as_uint(sunsky.config_x[4]),
+                      __float_as_uint(sunsky.config_x[5]),
+                      __float_as_uint(sunsky.config_x[6]));
+    compiler.add_node(__float_as_uint(sunsky.config_x[7]),
+                      __float_as_uint(sunsky.config_x[8]),
+                      __float_as_uint(sunsky.config_y[0]),
+                      __float_as_uint(sunsky.config_y[1]));
+    compiler.add_node(__float_as_uint(sunsky.config_y[2]),
+                      __float_as_uint(sunsky.config_y[3]),
+                      __float_as_uint(sunsky.config_y[4]),
+                      __float_as_uint(sunsky.config_y[5]));
+    compiler.add_node(__float_as_uint(sunsky.config_y[6]),
+                      __float_as_uint(sunsky.config_y[7]),
+                      __float_as_uint(sunsky.config_y[8]),
+                      __float_as_uint(sunsky.config_z[0]));
+    compiler.add_node(__float_as_uint(sunsky.config_z[1]),
+                      __float_as_uint(sunsky.config_z[2]),
+                      __float_as_uint(sunsky.config_z[3]),
+                      __float_as_uint(sunsky.config_z[4]));
+    compiler.add_node(__float_as_uint(sunsky.config_z[5]),
+                      __float_as_uint(sunsky.config_z[6]),
+                      __float_as_uint(sunsky.config_z[7]),
+                      __float_as_uint(sunsky.config_z[8]));
+  }
+  else {
+    compiler.add_node(__float_as_uint(sunsky.nishita_data[0]),
+                      __float_as_uint(sunsky.nishita_data[1]),
+                      __float_as_uint(sunsky.nishita_data[2]),
+                      __float_as_uint(sunsky.nishita_data[3]));
+    compiler.add_node(__float_as_uint(sunsky.nishita_data[4]),
+                      __float_as_uint(sunsky.nishita_data[5]),
+                      __float_as_uint(sunsky.nishita_data[6]),
+                      __float_as_uint(sunsky.nishita_data[7]));
+    compiler.add_node(__float_as_uint(sunsky.nishita_data[8]), handle.svm_slot(), 0, 0);
+  }
 
   tex_mapping.compile_end(compiler, vector_in, vector_offset);
 }
@@ -828,10 +902,32 @@ void SkyTextureNode::compile(OSLCompiler &compiler)
   tex_mapping.compile(compiler);
 
   SunSky sunsky;
-  if (type == NODE_SKY_OLD)
-    sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
-  else if (type == NODE_SKY_NEW)
-    sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+  if (type == NODE_SKY_PREETHAM)
+    sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity);
+  else if (type == NODE_SKY_HOSEK)
+    sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo);
+  else if (type == NODE_SKY_NISHITA) {
+    sky_texture_precompute_nishita(&sunsky,
+                                   sun_disc,
+                                   sun_size,
+                                   sun_elevation,
+                                   sun_rotation,
+                                   altitude,
+                                   air_density,
+                                   dust_density);
+    /* precomputed texture image parameters */
+    ImageManager *image_manager = compiler.scene->image_manager;
+    ImageParams impar;
+    impar.interpolation = INTERPOLATION_LINEAR;
+    impar.extension = EXTENSION_EXTEND;
+
+    /* precompute sky texture */
+    if (handle.empty()) {
+      SkyLoader *loader = new SkyLoader(
+          sun_elevation, altitude, air_density, dust_density, ozone_density);
+      handle = image_manager->add_image(loader, impar);
+    }
+  }
   else
     assert(false);
 
@@ -843,6 +939,11 @@ void SkyTextureNode::compile(OSLCompiler &compiler)
   compiler.parameter_array("config_x", sunsky.config_x, 9);
   compiler.parameter_array("config_y", sunsky.config_y, 9);
   compiler.parameter_array("config_z", sunsky.config_z, 9);
+  compiler.parameter_array("nishita_data", sunsky.nishita_data, 9);
+  /* nishita texture */
+  if (type == NODE_SKY_NISHITA) {
+    compiler.parameter_texture("filename", handle.svm_slot());
+  }
   compiler.add(this, "node_sky_texture");
 }
 
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 83c3ad071ae..846ba7423e5 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -168,7 +168,16 @@ class SkyTextureNode : public TextureNode {
   float3 sun_direction;
   float turbidity;
   float ground_albedo;
+  bool sun_disc;
+  float sun_size;
+  float sun_elevation;
+  float sun_rotation;
+  int altitude;
+  float air_density;
+  float dust_density;
+  float ozone_density;
   float3 vector;
+  ImageHandle handle;
 };
 
 class OutputNode : public ShaderNode {
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 61deef4cd76..c45ae5553a8 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -31,6 +31,7 @@
 #include "util/util_murmurhash.h"
 #include "util/util_progress.h"
 #include "util/util_set.h"
+#include "util/util_task.h"
 #include "util/util_vector.h"
 
 #include "subd/subd_patch_table.h"
@@ -77,7 +78,6 @@ struct UpdateObjectTransformState {
   Scene *scene;
 
   /* Some locks to keep everything thread-safe. */
-  thread_spin_lock queue_lock;
   thread_spin_lock surface_area_lock;
 
   /* First unused object index in the queue. */
@@ -219,7 +219,6 @@ void Object::tag_update(Scene *scene)
   }
 
   scene->camera->need_flags_update = true;
-  scene->curve_system_manager->need_update = true;
   scene->geometry_manager->need_update = true;
   scene->object_manager->need_update = true;
 }
@@ -550,41 +549,6 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
   }
 }
 
-bool ObjectManager::device_update_object_transform_pop_work(UpdateObjectTransformState *state,
-                                                            int *start_index,
-                                                            int *num_objects)
-{
-  /* Tweakable parameter, number of objects per chunk.
-   * Too small value will cause some extra overhead due to spin lock,
-   * too big value might not use all threads nicely.
-   */
-  static const int OBJECTS_PER_TASK = 32;
-  bool have_work = false;
-  state->queue_lock.lock();
-  int num_scene_objects = state->scene->objects.size();
-  if (state->queue_start_object < num_scene_objects) {
-    int count = min(OBJECTS_PER_TASK, num_scene_objects - state->queue_start_object);
-    *start_index = state->queue_start_object;
-    *num_objects = count;
-    state->queue_start_object += count;
-    have_work = true;
-  }
-  state->queue_lock.unlock();
-  return have_work;
-}
-
-void ObjectManager::device_update_object_transform_task(UpdateObjectTransformState *state)
-{
-  int start_index, num_objects;
-  while (device_update_object_transform_pop_work(state, &start_index, &num_objects)) {
-    for (int i = 0; i < num_objects; ++i) {
-      const int object_index = start_index + i;
-      Object *ob = state->scene->objects[object_index];
-      device_update_object_transform(state, ob);
-    }
-  }
-}
-
 void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, Progress &progress)
 {
   UpdateObjectTransformState state;
@@ -630,28 +594,19 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
     numparticles += psys->particles.size();
   }
 
-  /* NOTE: If it's just a handful of objects we deal with them in a single
-   * thread to avoid threading overhead. However, this threshold is might
-   * need some tweaks to make mid-complex scenes optimal.
-   */
-  if (scene->objects.size() < 64) {
-    foreach (Object *ob, scene->objects) {
-      device_update_object_transform(&state, ob);
-      if (progress.get_cancel()) {
-        return;
-      }
-    }
-  }
-  else {
-    const int num_threads = TaskScheduler::num_threads();
-    TaskPool pool;
-    for (int i = 0; i < num_threads; ++i) {
-      pool.push(function_bind(&ObjectManager::device_update_object_transform_task, this, &state));
-    }
-    pool.wait_work();
-    if (progress.get_cancel()) {
-      return;
-    }
+  /* Parallel object update, with grain size to avoid too much threading overhead
+   * for individual objects. */
+  static const int OBJECTS_PER_TASK = 32;
+  parallel_for(blocked_range<size_t>(0, scene->objects.size(), OBJECTS_PER_TASK),
+               [&](const blocked_range<size_t> &r) {
+                 for (size_t i = r.begin(); i != r.end(); i++) {
+                   Object *ob = state.scene->objects[i];
+                   device_update_object_transform(&state, ob);
+                 }
+               });
+
+  if (progress.get_cancel()) {
+    return;
   }
 
   dscene->objects.copy_to_device();
@@ -664,7 +619,6 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
 
   dscene->data.bvh.have_motion = state.have_motion;
   dscene->data.bvh.have_curves = state.have_curves;
-  dscene->data.bvh.have_instancing = true;
 }
 
 void ObjectManager::device_update(Device *device,
@@ -839,7 +793,6 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P
   bool motion_blur = need_motion == Scene::MOTION_BLUR;
   bool apply_to_motion = need_motion != Scene::MOTION_PASS;
   int i = 0;
-  bool have_instancing = false;
 
   foreach (Object *object, scene->objects) {
     map<Geometry *, int>::iterator it = geometry_users.find(object->geometry);
@@ -885,22 +838,15 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P
         if (geom->transform_negative_scaled)
           object_flag[i] |= SD_OBJECT_NEGATIVE_SCALE_APPLIED;
       }
-      else
-        have_instancing = true;
     }
-    else
-      have_instancing = true;
 
     i++;
   }
-
-  dscene->data.bvh.have_instancing = have_instancing;
 }
 
 void ObjectManager::tag_update(Scene *scene)
 {
   need_update = true;
-  scene->curve_system_manager->need_update = true;
   scene->geometry_manager->need_update = true;
   scene->light_manager->need_update = true;
 }
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index f5b68d5a4fe..9016a8d325f 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -108,7 +108,6 @@ Scene::Scene(const SceneParams &params_, Device *device)
   integrator = new Integrator();
   image_manager = new ImageManager(device->info);
   particle_system_manager = new ParticleSystemManager();
-  curve_system_manager = new CurveSystemManager();
   bake_manager = new BakeManager();
 
   /* OSL only works on the CPU */
@@ -156,7 +155,6 @@ void Scene::free_memory(bool final)
     light_manager->device_free(device, &dscene);
 
     particle_system_manager->device_free(device, &dscene);
-    curve_system_manager->device_free(device, &dscene);
 
     bake_manager->device_free(device, &dscene);
 
@@ -180,7 +178,6 @@ void Scene::free_memory(bool final)
     delete shader_manager;
     delete light_manager;
     delete particle_system_manager;
-    delete curve_system_manager;
     delete image_manager;
     delete bake_manager;
   }
@@ -233,12 +230,6 @@ void Scene::device_update(Device *device_, Progress &progress)
   if (progress.get_cancel() || device->have_error())
     return;
 
-  progress.set_status("Updating Hair Systems");
-  curve_system_manager->device_update(device, &dscene, this, progress);
-
-  if (progress.get_cancel() || device->have_error())
-    return;
-
   progress.set_status("Updating Particle Systems");
   particle_system_manager->device_update(device, &dscene, this, progress);
 
@@ -369,8 +360,7 @@ bool Scene::need_data_update()
   return (background->need_update || image_manager->need_update || object_manager->need_update ||
           geometry_manager->need_update || light_manager->need_update ||
           lookup_tables->need_update || integrator->need_update || shader_manager->need_update ||
-          particle_system_manager->need_update || curve_system_manager->need_update ||
-          bake_manager->need_update || film->need_update);
+          particle_system_manager->need_update || bake_manager->need_update || film->need_update);
 }
 
 bool Scene::need_reset()
@@ -393,7 +383,6 @@ void Scene::reset()
   geometry_manager->tag_update(this);
   light_manager->tag_update(this);
   particle_system_manager->tag_update(this);
-  curve_system_manager->tag_update(this);
 }
 
 void Scene::device_free()
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 6b10a901d7b..67616262c03 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -168,6 +168,8 @@ class SceneParams {
   bool use_bvh_spatial_split;
   bool use_bvh_unaligned_nodes;
   int num_bvh_time_steps;
+  int hair_subdivisions;
+  CurveShapeType hair_shape;
   bool persistent_data;
   int texture_limit;
 
@@ -181,6 +183,8 @@ class SceneParams {
     use_bvh_spatial_split = false;
     use_bvh_unaligned_nodes = true;
     num_bvh_time_steps = 0;
+    hair_subdivisions = 3;
+    hair_shape = CURVE_RIBBON;
     persistent_data = false;
     texture_limit = 0;
     background = true;
@@ -193,8 +197,15 @@ class SceneParams {
              use_bvh_spatial_split == params.use_bvh_spatial_split &&
              use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes &&
              num_bvh_time_steps == params.num_bvh_time_steps &&
+             hair_subdivisions == params.hair_subdivisions && hair_shape == params.hair_shape &&
              persistent_data == params.persistent_data && texture_limit == params.texture_limit);
   }
+
+  int curve_subdivisions()
+  {
+    /* Match Embree's tessellation rate limit; clamp the exponent so the shift stays defined. */
+    return 1 << clamp(hair_subdivisions, 0, 4);
+  }
 };
 
 /* Scene */
@@ -226,7 +237,6 @@ class Scene {
   GeometryManager *geometry_manager;
   ObjectManager *object_manager;
   ParticleSystemManager *particle_system_manager;
-  CurveSystemManager *curve_system_manager;
   BakeManager *bake_manager;
 
   /* default shaders */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index f5bfebbaf78..1a94d3e9db7 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -61,8 +61,10 @@ Session::Session(const SessionParams &params_)
 
   TaskScheduler::init(params.threads);
 
+  /* Create CPU/GPU devices. */
   device = Device::create(params.device, stats, profiler, params.background);
 
+  /* Create buffers for interactive rendering. */
   if (params.background && !params.write_render_cb) {
     buffers = NULL;
     display = NULL;
@@ -72,6 +74,9 @@ Session::Session(const SessionParams &params_)
     display = new DisplayBuffer(device, params.display_buffer_linear);
   }
 
+  /* Validate denoising parameters. */
+  set_denoising(params.denoising);
+
   session_thread = NULL;
   scene = NULL;
 
@@ -773,6 +778,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
    */
   bool use_motion = scene->need_motion() == Scene::MotionType::MOTION_BLUR;
   requested_features.use_hair = false;
+  requested_features.use_hair_thick = (scene->params.hair_shape == CURVE_THICK);
   requested_features.use_object_motion = false;
   requested_features.use_camera_motion = use_motion && scene->camera->use_motion();
   foreach (Object *object, scene->objects) {
@@ -804,7 +810,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
   requested_features.use_baking = bake_manager->get_baking();
   requested_features.use_integrator_branched = (scene->integrator->method ==
                                                 Integrator::BRANCHED_PATH);
-  if (params.run_denoising) {
+  if (params.denoising.use || params.denoising.store_passes) {
     requested_features.use_denoising = true;
     requested_features.use_shadow_tricks = true;
   }
@@ -941,24 +947,35 @@ void Session::set_pause(bool pause_)
     pause_cond.notify_all();
 }
 
-void Session::set_denoising(bool denoising, bool optix_denoising)
+void Session::set_denoising(const DenoiseParams &denoising)
 {
+  bool need_denoise = denoising.need_denoising_task();
+
   /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
   thread_scoped_lock buffers_lock(buffers_mutex);
+  params.denoising = denoising;
+
+  if (!(params.device.denoisers & denoising.type)) {
+    if (need_denoise) {
+      progress.set_error("Denoiser type not supported by compute device");
+    }
 
-  params.run_denoising = denoising;
-  params.full_denoising = !optix_denoising;
-  params.optix_denoising = optix_denoising;
+    params.denoising.use = false;
+    need_denoise = false;
+  }
 
   // TODO(pmours): Query the required overlap value for denoising from the device?
-  tile_manager.slice_overlap = denoising && !params.background ? 64 : 0;
-  tile_manager.schedule_denoising = denoising && !buffers;
+  tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0;
+
+  /* Schedule per tile denoising for final renders if we are either denoising or
+   * need prefiltered passes for the native denoiser. */
+  tile_manager.schedule_denoising = need_denoise && !buffers;
 }
 
 void Session::set_denoising_start_sample(int sample)
 {
-  if (sample != params.denoising_start_sample) {
-    params.denoising_start_sample = sample;
+  if (sample != params.denoising.start_sample) {
+    params.denoising.start_sample = sample;
 
     pause_cond.notify_all();
   }
@@ -1078,10 +1095,10 @@ void Session::update_status_time(bool show_pause, bool show_done)
        */
       substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
     }
-    if (params.full_denoising || params.optix_denoising) {
+    if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) {
       substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
     }
-    else if (params.run_denoising) {
+    else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) {
       substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles());
     }
   }
@@ -1110,7 +1127,7 @@ bool Session::render_need_denoise(bool &delayed)
   delayed = false;
 
   /* Denoising enabled? */
-  if (!params.run_denoising) {
+  if (!params.denoising.need_denoising_task()) {
     return false;
   }
 
@@ -1127,7 +1144,7 @@ bool Session::render_need_denoise(bool &delayed)
   }
 
   /* Do not denoise until the sample at which denoising should start is reached. */
-  if (tile_manager.state.sample < params.denoising_start_sample) {
+  if (tile_manager.state.sample < min(params.denoising.start_sample, params.samples - 1)) {
     return false;
   }
 
@@ -1178,9 +1195,6 @@ void Session::render(bool need_denoise)
     task.pass_denoising_clean = scene->film->denoising_clean_offset;
 
     task.denoising_from_render = true;
-    task.denoising_do_filter = params.full_denoising;
-    task.denoising_use_optix = params.optix_denoising;
-    task.denoising_write_passes = params.write_denoising_passes;
 
     if (tile_manager.schedule_denoising) {
       /* Acquire denoising tiles during rendering. */
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 2707eed5531..0141629762c 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -62,10 +62,6 @@ class SessionParams {
 
   bool display_buffer_linear;
 
-  bool run_denoising;
-  bool write_denoising_passes;
-  bool full_denoising;
-  bool optix_denoising;
   DenoiseParams denoising;
 
   double cancel_timeout;
@@ -94,11 +90,6 @@ class SessionParams {
 
     use_profiling = false;
 
-    run_denoising = false;
-    write_denoising_passes = false;
-    full_denoising = false;
-    optix_denoising = false;
-
     display_buffer_linear = false;
 
     cancel_timeout = 0.1;
@@ -125,7 +116,8 @@ class SessionParams {
              cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
              text_timeout == params.text_timeout &&
              progressive_update_timeout == params.progressive_update_timeout &&
-             tile_order == params.tile_order && shadingsystem == params.shadingsystem);
+             tile_order == params.tile_order && shadingsystem == params.shadingsystem &&
+             denoising.type == params.denoising.type);
   }
 };
 
@@ -161,7 +153,7 @@ class Session {
   void reset(BufferParams &params, int samples);
   void set_pause(bool pause);
   void set_samples(int samples);
-  void set_denoising(bool denoising, bool optix_denoising);
+  void set_denoising(const DenoiseParams &denoising);
   void set_denoising_start_sample(int sample);
 
   bool update_scene();
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 39ba45a751a..1120d909e98 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -33,6 +33,7 @@
 
 #include "util/util_foreach.h"
 #include "util/util_murmurhash.h"
+#include "util/util_task.h"
 
 #ifdef WITH_OCIO
 #  include <OpenColorIO/OpenColorIO.h>
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index ea3dbaf8e03..88714e20a90 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -94,8 +94,7 @@ void SVMShaderManager::device_update(Device *device,
                                  scene,
                                  scene->shaders[i],
                                  &progress,
-                                 &shader_svm_nodes[i]),
-                   false);
+                                 &shader_svm_nodes[i]));
   }
   task_pool.wait_work();
 
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index 87389ebfb16..4ea3470cda8 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -17,11 +17,15 @@
 #include "testing/mock_log.h"
 #include "testing/testing.h"
 
+#include "device/device.h"
+
 #include "render/graph.h"
 #include "render/nodes.h"
 #include "render/scene.h"
+
 #include "util/util_array.h"
 #include "util/util_logging.h"
+#include "util/util_stats.h"
 #include "util/util_string.h"
 #include "util/util_vector.h"
 
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index c1f71461dfd..ad4ea9c86e0 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -29,7 +29,7 @@ set(SRC
 )
 
 set(LIB
-
+  ${TBB_LIBRARIES}
 )
 
 if(WITH_CYCLES_STANDALONE)
@@ -86,6 +86,7 @@ set(SRC_HEADERS
   util_math_matrix.h
   util_md5.h
   util_murmurhash.h
+  util_openimagedenoise.h
   util_opengl.h
   util_optimization.h
   util_param.h
@@ -100,6 +101,7 @@ set(SRC_HEADERS
   util_sky_model.cpp
   util_sky_model.h
   util_sky_model_data.h
+  util_sky_nishita.cpp
   util_avxf.h
   util_avxb.h
   util_semaphore.h
@@ -112,6 +114,7 @@ set(SRC_HEADERS
   util_string.h
   util_system.h
   util_task.h
+  util_tbb.h
   util_texture.h
   util_thread.h
   util_time.h
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 3ce65802cff..6ad4f709ab5 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -31,7 +31,7 @@ DebugFlags::CPU::CPU()
       sse41(true),
       sse3(true),
       sse2(true),
-      bvh_layout(BVH_LAYOUT_DEFAULT),
+      bvh_layout(BVH_LAYOUT_AUTO),
       split_kernel(false)
 {
   reset();
@@ -57,18 +57,7 @@ void DebugFlags::CPU::reset()
 #undef STRINGIFY
 #undef CHECK_CPU_FLAGS
 
-  if (getenv("CYCLES_BVH2") != NULL) {
-    bvh_layout = BVH_LAYOUT_BVH2;
-  }
-  else if (getenv("CYCLES_BVH4") != NULL) {
-    bvh_layout = BVH_LAYOUT_BVH4;
-  }
-  else if (getenv("CYCLES_BVH8") != NULL) {
-    bvh_layout = BVH_LAYOUT_BVH8;
-  }
-  else {
-    bvh_layout = BVH_LAYOUT_DEFAULT;
-  }
+  bvh_layout = BVH_LAYOUT_AUTO;
 
   split_kernel = false;
 }
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index cf6b442b878..da9f5408b59 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -73,10 +73,10 @@ class DebugFlags {
       return sse2;
     }
 
-    /* Requested BVH size.
+    /* Requested BVH layout.
      *
-     * Rendering will use widest possible BVH which is below or equal
-     * this one.
+     * By default the fastest layout is used. For debugging, the layouts used by
+     * other CPUs and GPUs can be selected here instead.
      */
     BVHLayout bvh_layout;
 
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index dbed83ab84d..e979bd9e0c0 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -446,6 +446,11 @@ ccl_device_inline float fast_expf(float x)
 }
 
 #ifndef __KERNEL_GPU__
+/* MSVC seems to have a code-gen bug here in at least SSE41/AVX
+ * see T78047 for details. */
+#  ifdef _MSC_VER
+#    pragma optimize("", off)
+#  endif
 ccl_device float4 fast_exp2f4(float4 x)
 {
   const float4 one = make_float4(1.0f);
@@ -461,6 +466,9 @@ ccl_device float4 fast_exp2f4(float4 x)
   r = madd4(x, r, make_float4(1.0f));
   return __int4_as_float4(__float4_as_int4(r) + (m << 23));
 }
+#  ifdef _MSC_VER
+#    pragma optimize("", on)
+#  endif
 
 ccl_device_inline float4 fast_expf4(float4 x)
 {
diff --git a/intern/cycles/util/util_openimagedenoise.h b/intern/cycles/util/util_openimagedenoise.h
new file mode 100644
index 00000000000..aafa69cb530
--- /dev/null
+++ b/intern/cycles/util/util_openimagedenoise.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_OPENIMAGEDENOISE_H__
+#define __UTIL_OPENIMAGEDENOISE_H__
+
+#ifdef WITH_OPENIMAGEDENOISE
+#  include <OpenImageDenoise/oidn.hpp>
+#endif
+
+#include "util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+static inline bool openimagedenoise_supported()
+{
+#ifndef WITH_OPENIMAGEDENOISE
+  return false; /* Built without OpenImageDenoise. */
+#else
+  return system_cpu_support_sse41(); /* Available only when the CPU reports SSE4.1. */
+#endif
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_OPENIMAGEDENOISE_H__ */
diff --git a/intern/cycles/util/util_sky_model.h b/intern/cycles/util/util_sky_model.h
index 84340614b2c..36f1079a16d 100644
--- a/intern/cycles/util/util_sky_model.h
+++ b/intern/cycles/util/util_sky_model.h
@@ -298,6 +298,8 @@ HINT #1:   if you want to model the sky of an earth-like planet that orbits
            previous paragraph.
 */
 
+#include "util/util_types.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef _SKY_MODEL_H_
@@ -426,4 +428,26 @@ double arhosekskymodel_solar_radiance(ArHosekSkyModelState *state,
 
 #endif  // _SKY_MODEL_H_
 
+/* Nishita improved sky model */
+
+void nishita_skymodel_precompute_texture(float *pixels,
+                                         int stride,
+                                         int start_y,
+                                         int end_y,
+                                         int width,
+                                         int height,
+                                         float sun_elevation,
+                                         float altitude,
+                                         float air_density,
+                                         float dust_density,
+                                         float ozone_density);
+
+void nishita_skymodel_precompute_sun(float sun_elevation,
+                                     float angular_diameter,
+                                     float altitude,
+                                     float air_density,
+                                     float dust_density,
+                                     float *pixel_bottom,
+                                     float *pixel_top);
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_sky_nishita.cpp b/intern/cycles/util/util_sky_nishita.cpp
new file mode 100644
index 00000000000..92397804d43
--- /dev/null
+++ b/intern/cycles/util/util_sky_nishita.cpp
@@ -0,0 +1,371 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_math.h"
+#include "util/util_sky_model.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Constants */
+static const float rayleigh_scale = 8000.0f;        // Rayleigh scale height (m)
+static const float mie_scale = 1200.0f;             // Mie scale height (m)
+static const float mie_coeff = 2e-5f;               // Mie scattering coefficient
+static const float mie_G = 0.76f;                   // aerosols anisotropy
+static const float earth_radius = 6360000.0f;       // radius of Earth (m)
+static const float atmosphere_radius = 6420000.0f;  // radius of atmosphere (m)
+static const int steps = 32;                        // segments per primary ray
+static const int steps_light = 16;                  // segments per sun connection ray
+static const int num_wavelengths = 21;              // number of wavelengths
+/* irradiance at top of atmosphere */
+static const float irradiance[] = {
+    1.45756829855592995315f, 1.56596305559738380175f, 1.65148449067670455293f,
+    1.71496242737209314555f, 1.75797983805020541226f, 1.78256407885924539336f,
+    1.79095108475838560302f, 1.78541550133410664714f, 1.76815554864306845317f,
+    1.74122069647250410362f, 1.70647127164943679389f, 1.66556087452739887134f,
+    1.61993437242451854274f, 1.57083597368892080581f, 1.51932335059305478886f,
+    1.46628494965214395407f, 1.41245852740172450623f, 1.35844961970384092709f,
+    1.30474913844739281998f, 1.25174963272610817455f, 1.19975998755420620867f};
+/* Rayleigh scattering coefficient */
+static const float rayleigh_coeff[] = {
+    0.00005424820087636473f, 0.00004418549866505454f, 0.00003635151910165377f,
+    0.00003017929012024763f, 0.00002526320226989157f, 0.00002130859310621843f,
+    0.00001809838025320633f, 0.00001547057129129042f, 0.00001330284977336850f,
+    0.00001150184784075764f, 0.00000999557429990163f, 0.00000872799973630707f,
+    0.00000765513700977967f, 0.00000674217203751443f, 0.00000596134125832052f,
+    0.00000529034598065810f, 0.00000471115687557433f, 0.00000420910481110487f,
+    0.00000377218381260133f, 0.00000339051255477280f, 0.00000305591531679811f};
+/* Ozone absorption coefficient */
+static const float ozone_coeff[] = {
+    0.00000000325126849861f, 0.00000000585395365047f, 0.00000001977191155085f,
+    0.00000007309568762914f, 0.00000020084561514287f, 0.00000040383958096161f,
+    0.00000063551335912363f, 0.00000096707041180970f, 0.00000154797400424410f,
+    0.00000209038647223331f, 0.00000246128056164565f, 0.00000273551299461512f,
+    0.00000215125863128643f, 0.00000159051840791988f, 0.00000112356197979857f,
+    0.00000073527551487574f, 0.00000046450130357806f, 0.00000033096079921048f,
+    0.00000022512612292678f, 0.00000014879129266490f, 0.00000016828623364192f};
+/* CIE XYZ color matching functions */
+static const float cmf_xyz[][3] = {{0.00136800000f, 0.00003900000f, 0.00645000100f},
+                                   {0.01431000000f, 0.00039600000f, 0.06785001000f},
+                                   {0.13438000000f, 0.00400000000f, 0.64560000000f},
+                                   {0.34828000000f, 0.02300000000f, 1.74706000000f},
+                                   {0.29080000000f, 0.06000000000f, 1.66920000000f},
+                                   {0.09564000000f, 0.13902000000f, 0.81295010000f},
+                                   {0.00490000000f, 0.32300000000f, 0.27200000000f},
+                                   {0.06327000000f, 0.71000000000f, 0.07824999000f},
+                                   {0.29040000000f, 0.95400000000f, 0.02030000000f},
+                                   {0.59450000000f, 0.99500000000f, 0.00390000000f},
+                                   {0.91630000000f, 0.87000000000f, 0.00165000100f},
+                                   {1.06220000000f, 0.63100000000f, 0.00080000000f},
+                                   {0.85444990000f, 0.38100000000f, 0.00019000000f},
+                                   {0.44790000000f, 0.17500000000f, 0.00002000000f},
+                                   {0.16490000000f, 0.06100000000f, 0.00000000000f},
+                                   {0.04677000000f, 0.01700000000f, 0.00000000000f},
+                                   {0.01135916000f, 0.00410200000f, 0.00000000000f},
+                                   {0.00289932700f, 0.00104700000f, 0.00000000000f},
+                                   {0.00069007860f, 0.00024920000f, 0.00000000000f},
+                                   {0.00016615050f, 0.00006000000f, 0.00000000000f},
+                                   {0.00004150994f, 0.00001499000f, 0.00000000000f}};
+
+static float3 geographical_to_direction(float lat, float lon)
+{
+  return make_float3(cosf(lat) * cosf(lon), cosf(lat) * sinf(lon), sinf(lat));
+}
+
+static float3 spec_to_xyz(float *spectrum)
+{
+  float X = 0.0f, Y = 0.0f, Z = 0.0f; /* Sums of the samples weighted by cmf_xyz. */
+  for (int wl = 0; wl < num_wavelengths; wl++) {
+    X += cmf_xyz[wl][0] * spectrum[wl];
+    Y += cmf_xyz[wl][1] * spectrum[wl];
+    Z += cmf_xyz[wl][2] * spectrum[wl];
+  }
+  return make_float3(X, Y, Z) * (20 * 683 * 1e-9f);
+}
+
+/* Atmosphere volume models */
+
+static float density_rayleigh(float height)
+{
+  return expf(-height / rayleigh_scale);
+}
+
+static float density_mie(float height)
+{
+  return expf(-height / mie_scale);
+}
+
+static float density_ozone(float height)
+{
+  /* Piecewise-linear layer: 0 at 10 km, rising to 1 at 25 km, back to 0 at 40 km. */
+  if (height >= 10000.0f && height < 25000.0f)
+    return 1.0f / 15000.0f * height - 2.0f / 3.0f;
+  if (height >= 25000.0f && height < 40000.0f)
+    return -(1.0f / 15000.0f * height - 8.0f / 3.0f);
+  return 0.0f;
+}
+
+static float phase_rayleigh(float mu)
+{
+  return 3.0f / (16.0f * M_PI_F) * (1.0f + sqr(mu));
+}
+
+static float phase_mie(float mu)
+{
+  /* Mie phase term for anisotropy mie_G; callers pass mu = dot(ray_dir, sun_dir). */
+  static const float g2 = mie_G * mie_G;
+  const float numerator = 3.0f * (1.0f - g2) * (1.0f + sqr(mu));
+  return numerator / (8.0f * M_PI_F * (2.0f + g2) * powf(1.0f + g2 - 2.0f * mie_G * mu, 1.5f));
+}
+
+/* Intersection helpers */
+static bool surface_intersection(float3 pos, float3 dir)
+{
+  if (dir.z >= 0)
+    return false;
+  float t = dot(dir, -pos) / len_squared(dir);
+  float D = pos.x * pos.x - 2.0f * (-pos.x) * dir.x * t + dir.x * t * dir.x * t + pos.y * pos.y -
+            2.0f * (-pos.y) * dir.y * t + (dir.y * t) * (dir.y * t) + pos.z * pos.z -
+            2.0f * (-pos.z) * dir.z * t + dir.z * t * dir.z * t;
+  return (D <= sqr(earth_radius));
+}
+
+static float3 atmosphere_intersection(float3 pos, float3 dir)
+{
+  float b = -2.0f * dot(dir, -pos); /* Quadratic |pos + t * dir|^2 = atmosphere_radius^2. */
+  float c = len_squared(pos) - sqr(atmosphere_radius);
+  float t = (-b + sqrtf(b * b - 4.0f * c)) / 2.0f; /* Larger root; a = 1 presumes |dir| = 1. */
+  return make_float3(pos.x + dir.x * t, pos.y + dir.y * t, pos.z + dir.z * t);
+}
+
+static float3 ray_optical_depth(float3 ray_origin, float3 ray_dir)
+{
+  /* This code computes the optical depth along a ray through the atmosphere. */
+  float3 ray_end = atmosphere_intersection(ray_origin, ray_dir);
+  float ray_length = distance(ray_origin, ray_end);
+
+  /* To compute the optical depth, we step along the ray in segments and
+   * accumulate the optical depth along each segment. */
+  float segment_length = ray_length / steps_light;
+  float3 segment = segment_length * ray_dir;
+
+  /* Instead of tracking the transmission spectrum across all wavelengths directly,
+   * we use the fact that the density always has the same spectrum for each type of
+   * scattering, so we split the density into a constant spectrum and a factor and
+   * only track the factors. */
+  float3 optical_depth = make_float3(0.0f, 0.0f, 0.0f);
+
+  /* The density of each segment is evaluated at its middle. */
+  float3 P = ray_origin + 0.5f * segment;
+  for (int i = 0; i < steps_light; i++) {
+    /* Compute height above sea level. */
+    float height = len(P) - earth_radius;
+
+    /* Accumulate optical depth of this segment (density is assumed to be constant along it). */
+    float3 density = make_float3(
+        density_rayleigh(height), density_mie(height), density_ozone(height));
+    optical_depth += segment_length * density;
+
+    /* Advance along ray. */
+    P += segment;
+  }
+
+  return optical_depth;
+}
+
+/* Single Scattering implementation */
+static void single_scattering(float3 ray_dir,
+                              float3 sun_dir,
+                              float3 ray_origin,
+                              float air_density,
+                              float dust_density,
+                              float ozone_density,
+                              float *r_spectrum)
+{
+  /* This code computes single-inscattering along a ray through the atmosphere. */
+  float3 ray_end = atmosphere_intersection(ray_origin, ray_dir);
+  float ray_length = distance(ray_origin, ray_end);
+
+  /* To compute the inscattering, we step along the ray in segments and accumulate
+   * the inscattering as well as the optical depth along each segment. */
+  float segment_length = ray_length / steps;
+  float3 segment = segment_length * ray_dir;
+
+  /* Instead of tracking the transmission spectrum across all wavelengths directly,
+   * we use the fact that the density always has the same spectrum for each type of
+   * scattering, so we split the density into a constant spectrum and a factor and
+   * only track the factors. */
+  float3 optical_depth = make_float3(0.0f, 0.0f, 0.0f);
+
+  /* Zero out light accumulation. */
+  for (int wl = 0; wl < num_wavelengths; wl++) {
+    r_spectrum[wl] = 0.0f;
+  }
+
+  /* Compute phase function for scattering and the density scale factor. */
+  float mu = dot(ray_dir, sun_dir);
+  float3 phase_function = make_float3(phase_rayleigh(mu), phase_mie(mu), 0.0f);
+  float3 density_scale = make_float3(air_density, dust_density, ozone_density);
+
+  /* The density and in-scattering of each segment is evaluated at its middle. */
+  float3 P = ray_origin + 0.5f * segment;
+  for (int i = 0; i < steps; i++) {
+    /* Compute height above sea level. */
+    float height = len(P) - earth_radius;
+
+    /* Evaluate and accumulate optical depth along the ray. */
+    float3 density = density_scale * make_float3(density_rayleigh(height),
+                                                 density_mie(height),
+                                                 density_ozone(height));
+    optical_depth += segment_length * density;
+
+    /* If the earth isn't in the way, evaluate inscattering from the sun. */
+    if (!surface_intersection(P, sun_dir)) {
+      float3 light_optical_depth = density_scale * ray_optical_depth(P, sun_dir);
+      float3 total_optical_depth = optical_depth + light_optical_depth;
+
+      /* attenuation of light */
+      for (int wl = 0; wl < num_wavelengths; wl++) {
+        float3 extinction_density = total_optical_depth * make_float3(rayleigh_coeff[wl],
+                                                                      1.11f * mie_coeff,
+                                                                      ozone_coeff[wl]);
+        float attenuation = expf(-reduce_add(extinction_density));
+
+        float3 scattering_density = density * make_float3(rayleigh_coeff[wl], mie_coeff, 0.0f);
+
+        /* The total inscattered radiance from one segment is:
+         * Tr(A<->B) * Tr(B<->C) * sigma_s * phase * L * segment_length
+         *
+         * These terms are:
+         * - Tr(A<->B): transmission from start to scattering position (from optical_depth);
+         * - Tr(B<->C): transmission from scattering position to light (from ray_optical_depth);
+         * - sigma_s: scattering density; phase: phase function of its type (Rayleigh or Mie);
+         * - L: radiance coming from the light source;
+         * - segment_length: the length of the segment.
+         *
+         * The code here is just that, with a bit of additional optimization to not store full
+         * spectra for the optical depth.
+         */
+        r_spectrum[wl] += attenuation * reduce_add(phase_function * scattering_density) *
+                          irradiance[wl] * segment_length;
+      }
+    }
+
+    /* Advance along ray. */
+    P += segment;
+  }
+}
+
+/* Fill rows [start_y, end_y) of the sky texture with XYZ colors from single scattering. */
+void nishita_skymodel_precompute_texture(float *pixels,
+                                         int stride,
+                                         int start_y,
+                                         int end_y,
+                                         int width,
+                                         int height,
+                                         float sun_elevation,
+                                         float altitude,
+                                         float air_density,
+                                         float dust_density,
+                                         float ozone_density)
+{
+  /* Only the left half of each row is computed and then mirrored. Round up so that the
+   * center column of an odd width is written as well. */
+  int half_width = (width + 1) / 2;
+  float spectrum[num_wavelengths];
+  float3 cam_pos = make_float3(0, 0, earth_radius + altitude);
+  float3 sun_dir = geographical_to_direction(sun_elevation, 0.0f);
+
+  float latitude_step = M_PI_2_F / height;
+  float longitude_step = M_2PI_F / width;
+
+  for (int y = start_y; y < end_y; y++) {
+    float latitude = latitude_step * y;
+    float *pixel_row = pixels + (y * width) * stride;
+    for (int x = 0; x < half_width; x++) {
+      float longitude = longitude_step * x - M_PI_F;
+
+      float3 dir = geographical_to_direction(latitude, longitude);
+      single_scattering(dir, sun_dir, cam_pos, air_density, dust_density, ozone_density, spectrum);
+      float3 xyz = spec_to_xyz(spectrum);
+
+      pixel_row[x * stride + 0] = xyz.x;
+      pixel_row[x * stride + 1] = xyz.y;
+      pixel_row[x * stride + 2] = xyz.z;
+      int mirror_x = width - x - 1;
+      pixel_row[mirror_x * stride + 0] = xyz.x;
+      pixel_row[mirror_x * stride + 1] = xyz.y;
+      pixel_row[mirror_x * stride + 2] = xyz.z;
+    }
+  }
+}
+
+/* Sun disc: spectral radiance of the sun seen along cam_dir, attenuated by air and dust. */
+static void sun_radiation(float3 cam_dir,
+                          float altitude,
+                          float air_density,
+                          float dust_density,
+                          float solid_angle,
+                          float *r_spectrum)
+{
+  /* Optical depth factors (x: Rayleigh, y: Mie) from the viewer to the atmosphere's edge. */
+  const float3 viewer = make_float3(0, 0, earth_radius + altitude);
+  const float3 depth = ray_optical_depth(viewer, cam_dir);
+
+  for (int wl = 0; wl < num_wavelengths; wl++) {
+    /* Extinction exponent; the ozone factor (depth.z) is not part of it. */
+    const float extinction = rayleigh_coeff[wl] * depth.x * air_density +
+                             1.11f * mie_coeff * depth.y * dust_density;
+    r_spectrum[wl] = (irradiance[wl] / solid_angle) * expf(-extinction);
+  }
+}
+
+void nishita_skymodel_precompute_sun(float sun_elevation,
+                                     float angular_diameter,
+                                     float altitude,
+                                     float air_density,
+                                     float dust_density,
+                                     float *pixel_bottom,
+                                     float *pixel_top)
+{
+  /* Solid angle of a cone whose half-angle is the angular radius of the sun. */
+  const float angular_radius = angular_diameter / 2.0f;
+  const float solid_angle = M_2PI_F * (1.0f - cosf(angular_radius));
+
+  /* Elevations of the lower and upper edge of the disc; negative values are raised to 0. */
+  const float lower_edge = sun_elevation - angular_radius;
+  const float upper_edge = sun_elevation + angular_radius;
+  const float elevation_bottom = (lower_edge > 0.0f) ? lower_edge : 0.0f;
+  const float elevation_top = (upper_edge > 0.0f) ? upper_edge : 0.0f;
+
+  /* Radiance at both edges, converted from spectrum to XYZ (the spectrum buffer is reused). */
+  float spectrum[num_wavelengths];
+  float3 edge_dir = geographical_to_direction(elevation_bottom, 0.0f);
+  sun_radiation(edge_dir, altitude, air_density, dust_density, solid_angle, spectrum);
+  const float3 xyz_bottom = spec_to_xyz(spectrum);
+  edge_dir = geographical_to_direction(elevation_top, 0.0f);
+  sun_radiation(edge_dir, altitude, air_density, dust_density, solid_angle, spectrum);
+  const float3 xyz_top = spec_to_xyz(spectrum);
+
+  /* Write both results out as XYZ triplets. */
+  pixel_bottom[0] = xyz_bottom.x;
+  pixel_bottom[1] = xyz_bottom.y;
+  pixel_bottom[2] = xyz_bottom.z;
+  pixel_top[0] = xyz_top.x;
+  pixel_top[1] = xyz_top.y;
+  pixel_top[2] = xyz_top.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 61aa28c6815..4fb61392e92 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -20,100 +20,28 @@
 #include "util/util_system.h"
 #include "util/util_time.h"
 
-//#define THREADING_DEBUG_ENABLED
-
-#ifdef THREADING_DEBUG_ENABLED
-#  include <stdio.h>
-#  define THREADING_DEBUG(...) \
-    do { \
-      printf(__VA_ARGS__); \
-      fflush(stdout); \
-    } while (0)
-#else
-#  define THREADING_DEBUG(...)
-#endif
-
 CCL_NAMESPACE_BEGIN
 
 /* Task Pool */
 
-TaskPool::TaskPool()
+TaskPool::TaskPool() : start_time(time_dt()), num_tasks_handled(0)
 {
-  num_tasks_handled = 0;
-  num = 0;
-  do_cancel = false;
 }
 
 TaskPool::~TaskPool()
 {
-  stop();
+  cancel();
 }
 
-void TaskPool::push(Task *task, bool front)
+void TaskPool::push(TaskRunFunction &&task)
 {
-  TaskScheduler::Entry entry;
-
-  entry.task = task;
-  entry.pool = this;
-
-  TaskScheduler::push(entry, front);
-}
-
-void TaskPool::push(const TaskRunFunction &run, bool front)
-{
-  push(new Task(run), front);
+  tbb_group.run(std::move(task));
+  num_tasks_handled++;
 }
 
 void TaskPool::wait_work(Summary *stats)
 {
-  thread_scoped_lock num_lock(num_mutex);
-
-  while (num != 0) {
-    num_lock.unlock();
-
-    thread_scoped_lock queue_lock(TaskScheduler::queue_mutex);
-
-    /* find task from this pool. if we get a task from another pool,
-     * we can get into deadlock */
-    TaskScheduler::Entry work_entry;
-    bool found_entry = false;
-    list<TaskScheduler::Entry>::iterator it;
-
-    for (it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); it++) {
-      TaskScheduler::Entry &entry = *it;
-
-      if (entry.pool == this) {
-        work_entry = entry;
-        found_entry = true;
-        TaskScheduler::queue.erase(it);
-        break;
-      }
-    }
-
-    queue_lock.unlock();
-
-    /* if found task, do it, otherwise wait until other tasks are done */
-    if (found_entry) {
-      /* run task */
-      work_entry.task->run(0);
-
-      /* delete task */
-      delete work_entry.task;
-
-      /* notify pool task was done */
-      num_decrease(1);
-    }
-
-    num_lock.lock();
-    if (num == 0)
-      break;
-
-    if (!found_entry) {
-      THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num);
-      num_cond.wait(num_lock);
-      THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num);
-    }
-  }
+  tbb_group.wait();
 
   if (stats != NULL) {
     stats->time_total = time_dt() - start_time;
@@ -123,193 +51,21 @@ void TaskPool::wait_work(Summary *stats)
 
 void TaskPool::cancel()
 {
-  do_cancel = true;
-
-  TaskScheduler::clear(this);
-
-  {
-    thread_scoped_lock num_lock(num_mutex);
-
-    while (num) {
-      THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num);
-      num_cond.wait(num_lock);
-      THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num);
-    }
-  }
-
-  do_cancel = false;
-}
-
-void TaskPool::stop()
-{
-  TaskScheduler::clear(this);
-
-  assert(num == 0);
+  tbb_group.cancel();
+  tbb_group.wait();
 }
 
 bool TaskPool::canceled()
 {
-  return do_cancel;
-}
-
-bool TaskPool::finished()
-{
-  thread_scoped_lock num_lock(num_mutex);
-  return num == 0;
-}
-
-void TaskPool::num_decrease(int done)
-{
-  num_mutex.lock();
-  num -= done;
-
-  assert(num >= 0);
-  if (num == 0) {
-    THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num);
-    num_cond.notify_all();
-  }
-
-  num_mutex.unlock();
-}
-
-void TaskPool::num_increase()
-{
-  thread_scoped_lock num_lock(num_mutex);
-  if (num_tasks_handled == 0) {
-    start_time = time_dt();
-  }
-  num++;
-  num_tasks_handled++;
-  THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num);
-  num_cond.notify_all();
+  return tbb_group.is_canceling();
 }
 
 /* Task Scheduler */
 
 thread_mutex TaskScheduler::mutex;
 int TaskScheduler::users = 0;
-vector<thread *> TaskScheduler::threads;
-bool TaskScheduler::do_exit = false;
-
-list<TaskScheduler::Entry> TaskScheduler::queue;
-thread_mutex TaskScheduler::queue_mutex;
-thread_condition_variable TaskScheduler::queue_cond;
-
-namespace {
-
-/* Get number of processors on each of the available nodes. The result is sized
- * by the highest node index, and element corresponds to number of processors on
- * that node.
- * If node is not available, then the corresponding number of processors is
- * zero. */
-void get_per_node_num_processors(vector<int> *num_per_node_processors)
-{
-  const int num_nodes = system_cpu_num_numa_nodes();
-  if (num_nodes == 0) {
-    LOG(ERROR) << "Zero available NUMA nodes, is not supposed to happen.";
-    return;
-  }
-  num_per_node_processors->resize(num_nodes);
-  for (int node = 0; node < num_nodes; ++node) {
-    if (!system_cpu_is_numa_node_available(node)) {
-      (*num_per_node_processors)[node] = 0;
-      continue;
-    }
-    (*num_per_node_processors)[node] = system_cpu_num_numa_node_processors(node);
-  }
-}
-
-/* Calculate total number of processors on all available nodes.
- * This is similar to system_cpu_thread_count(), but uses pre-calculated number
- * of processors on each of the node, avoiding extra system calls and checks for
- * the node availability. */
-int get_num_total_processors(const vector<int> &num_per_node_processors)
-{
-  int num_total_processors = 0;
-  foreach (int num_node_processors, num_per_node_processors) {
-    num_total_processors += num_node_processors;
-  }
-  return num_total_processors;
-}
-
-/* Compute NUMA node for every thread to run on, for the best performance. */
-vector<int> distribute_threads_on_nodes(const int num_threads)
-{
-  /* Start with all threads unassigned to any specific NUMA node. */
-  vector<int> thread_nodes(num_threads, -1);
-  const int num_active_group_processors = system_cpu_num_active_group_processors();
-  VLOG(1) << "Detected " << num_active_group_processors << " processors "
-          << "in active group.";
-  if (num_active_group_processors >= num_threads) {
-    /* If the current thread is set up in a way that its affinity allows to
-     * use at least requested number of threads we do not explicitly set
-     * affinity to the worker threads.
-     * This way we allow users to manually edit affinity of the parent
-     * thread, and here we follow that affinity. This way it's possible to
-     * have two Cycles/Blender instances running manually set to a different
-     * dies on a CPU. */
-    VLOG(1) << "Not setting thread group affinity.";
-    return thread_nodes;
-  }
-  vector<int> num_per_node_processors;
-  get_per_node_num_processors(&num_per_node_processors);
-  if (num_per_node_processors.size() == 0) {
-    /* Error was already reported, here we can't do anything, so we simply
-     * leave default affinity to all the worker threads. */
-    return thread_nodes;
-  }
-  const int num_nodes = num_per_node_processors.size();
-  int thread_index = 0;
-  /* First pass: fill in all the nodes to their maximum.
-   *
-   * If there is less threads than the overall nodes capacity, some of the
-   * nodes or parts of them will idle.
-   *
-   * TODO(sergey): Consider picking up fastest nodes if number of threads
-   * fits on them. For example, on Threadripper2 we might consider using nodes
-   * 0 and 2 if user requested 32 render threads. */
-  const int num_total_node_processors = get_num_total_processors(num_per_node_processors);
-  int current_node_index = 0;
-  while (thread_index < num_total_node_processors && thread_index < num_threads) {
-    const int num_node_processors = num_per_node_processors[current_node_index];
-    for (int processor_index = 0; processor_index < num_node_processors; ++processor_index) {
-      VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << ".";
-      thread_nodes[thread_index] = current_node_index;
-      ++thread_index;
-      if (thread_index == num_threads) {
-        /* All threads are scheduled on their nodes. */
-        return thread_nodes;
-      }
-    }
-    ++current_node_index;
-  }
-  /* Second pass: keep scheduling threads to each node one by one,
-   * uniformly filling them in.
-   * This is where things becomes tricky to predict for the maximum
-   * performance: on the one hand this avoids too much threading overhead on
-   * few nodes, but for the final performance having all the overhead on one
-   * node might be better idea (since other nodes will have better chance of
-   * rendering faster).
-   * But more tricky is that nodes might have difference capacity, so we might
-   * want to do some weighted scheduling. For example, if node 0 has 16
-   * processors and node 1 has 32 processors, we'd better schedule 1 extra
-   * thread on node 0 and 2 extra threads on node 1. */
-  current_node_index = 0;
-  while (thread_index < num_threads) {
-    /* Skip unavailable nodes. */
-    /* TODO(sergey): Add sanity check against deadlock. */
-    while (num_per_node_processors[current_node_index] == 0) {
-      current_node_index = (current_node_index + 1) % num_nodes;
-    }
-    VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << ".";
-    ++thread_index;
-    current_node_index = (current_node_index + 1) % num_nodes;
-  }
-
-  return thread_nodes;
-}
-
-}  // namespace
+int TaskScheduler::active_num_threads = 0;
+tbb::global_control *TaskScheduler::global_control = nullptr;
 
 void TaskScheduler::init(int num_threads)
 {
@@ -320,22 +76,15 @@ void TaskScheduler::init(int num_threads)
   if (users != 1) {
     return;
   }
-  do_exit = false;
-  const bool use_auto_threads = (num_threads == 0);
-  if (use_auto_threads) {
+  if (num_threads > 0) {
-    /* Automatic number of threads. */
+    /* Explicit number of threads requested, limit TBB parallelism to it. */
-    num_threads = system_cpu_thread_count();
+    VLOG(1) << "Overriding number of TBB threads to " << num_threads << ".";
+    global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism,
+                                             num_threads);
+    active_num_threads = num_threads;
   }
-  VLOG(1) << "Creating pool of " << num_threads << " threads.";
-
-  /* Compute distribution on NUMA nodes. */
-  vector<int> thread_nodes = distribute_threads_on_nodes(num_threads);
-
-  /* Launch threads that will be waiting for work. */
-  threads.resize(num_threads);
-  for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
-    threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run, thread_index + 1),
-                                       thread_nodes[thread_index]);
+  else {
+    active_num_threads = system_cpu_thread_count();
   }
 }
 
@@ -344,105 +93,20 @@ void TaskScheduler::exit()
   thread_scoped_lock lock(mutex);
   users--;
   if (users == 0) {
-    VLOG(1) << "De-initializing thread pool of task scheduler.";
-    /* stop all waiting threads */
-    TaskScheduler::queue_mutex.lock();
-    do_exit = true;
-    TaskScheduler::queue_cond.notify_all();
-    TaskScheduler::queue_mutex.unlock();
-
-    /* delete threads */
-    foreach (thread *t, threads) {
-      t->join();
-      delete t;
-    }
-    threads.clear();
+    delete global_control;
+    global_control = nullptr;
+    active_num_threads = 0;
   }
 }
 
 void TaskScheduler::free_memory()
 {
   assert(users == 0);
-  threads.free_memory();
-}
-
-bool TaskScheduler::thread_wait_pop(Entry &entry)
-{
-  thread_scoped_lock queue_lock(queue_mutex);
-
-  while (queue.empty() && !do_exit)
-    queue_cond.wait(queue_lock);
-
-  if (queue.empty()) {
-    assert(do_exit);
-    return false;
-  }
-
-  entry = queue.front();
-  queue.pop_front();
-
-  return true;
 }
 
-void TaskScheduler::thread_run(int thread_id)
+int TaskScheduler::num_threads()
 {
-  Entry entry;
-
-  /* todo: test affinity/denormal mask */
-
-  /* keep popping off tasks */
-  while (thread_wait_pop(entry)) {
-    /* run task */
-    entry.task->run(thread_id);
-
-    /* delete task */
-    delete entry.task;
-
-    /* notify pool task was done */
-    entry.pool->num_decrease(1);
-  }
-}
-
-void TaskScheduler::push(Entry &entry, bool front)
-{
-  entry.pool->num_increase();
-
-  /* add entry to queue */
-  TaskScheduler::queue_mutex.lock();
-  if (front)
-    TaskScheduler::queue.push_front(entry);
-  else
-    TaskScheduler::queue.push_back(entry);
-
-  TaskScheduler::queue_cond.notify_one();
-  TaskScheduler::queue_mutex.unlock();
-}
-
-void TaskScheduler::clear(TaskPool *pool)
-{
-  thread_scoped_lock queue_lock(TaskScheduler::queue_mutex);
-
-  /* erase all tasks from this pool from the queue */
-  list<Entry>::iterator it = queue.begin();
-  int done = 0;
-
-  while (it != queue.end()) {
-    Entry &entry = *it;
-
-    if (entry.pool == pool) {
-      done++;
-      delete entry.task;
-
-      it = queue.erase(it);
-    }
-    else
-      it++;
-  }
-
-  queue_lock.unlock();
-
-  /* notify done */
-  pool->num_decrease(done);
+  return active_num_threads;
 }
 
 /* Dedicated Task Pool */
@@ -458,31 +122,30 @@ DedicatedTaskPool::DedicatedTaskPool()
 
 DedicatedTaskPool::~DedicatedTaskPool()
 {
-  stop();
+  wait();
+
+  do_exit = true;
+  queue_cond.notify_all();
+
   worker_thread->join();
   delete worker_thread;
 }
 
-void DedicatedTaskPool::push(Task *task, bool front)
+void DedicatedTaskPool::push(TaskRunFunction &&task, bool front)
 {
   num_increase();
 
   /* add task to queue */
   queue_mutex.lock();
   if (front)
-    queue.push_front(task);
+    queue.emplace_front(std::move(task));
   else
-    queue.push_back(task);
+    queue.emplace_back(std::move(task));
 
   queue_cond.notify_one();
   queue_mutex.unlock();
 }
 
-void DedicatedTaskPool::push(const TaskRunFunction &run, bool front)
-{
-  push(new Task(run), front);
-}
-
 void DedicatedTaskPool::wait()
 {
   thread_scoped_lock num_lock(num_mutex);
@@ -501,18 +164,6 @@ void DedicatedTaskPool::cancel()
   do_cancel = false;
 }
 
-void DedicatedTaskPool::stop()
-{
-  clear();
-
-  do_exit = true;
-  queue_cond.notify_all();
-
-  wait();
-
-  assert(num == 0);
-}
-
 bool DedicatedTaskPool::canceled()
 {
   return do_cancel;
@@ -535,7 +186,7 @@ void DedicatedTaskPool::num_increase()
   num_cond.notify_all();
 }
 
-bool DedicatedTaskPool::thread_wait_pop(Task *&task)
+bool DedicatedTaskPool::thread_wait_pop(TaskRunFunction &task)
 {
   thread_scoped_lock queue_lock(queue_mutex);
 
@@ -555,15 +206,15 @@ bool DedicatedTaskPool::thread_wait_pop(Task *&task)
 
 void DedicatedTaskPool::thread_run()
 {
-  Task *task;
+  TaskRunFunction task;
 
   /* keep popping off tasks */
   while (thread_wait_pop(task)) {
     /* run task */
-    task->run(0);
+    task();
 
     /* delete task */
-    delete task;
+    task = nullptr;
 
     /* notify task was done */
     num_decrease(1);
@@ -575,15 +226,8 @@ void DedicatedTaskPool::clear()
   thread_scoped_lock queue_lock(queue_mutex);
 
   /* erase all tasks from the queue */
-  list<Task *>::iterator it = queue.begin();
-  int done = 0;
-
-  while (it != queue.end()) {
-    done++;
-    delete *it;
-
-    it = queue.erase(it);
-  }
+  int done = queue.size();
+  queue.clear();
 
   queue_lock.unlock();
 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index fd30a33d8ef..a56ca62f62c 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -19,48 +19,16 @@
 
 #include "util/util_list.h"
 #include "util/util_string.h"
+#include "util/util_tbb.h"
 #include "util/util_thread.h"
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
-class Task;
 class TaskPool;
 class TaskScheduler;
 
-/* Notes on Thread ID
- *
- * Thread ID argument reports the 0-based ID of a working thread from which
- * the run() callback is being invoked. Thread ID of 0 denotes the thread from
- * which wait_work() was called.
- *
- * DO NOT use this ID to control execution flaw, use it only for things like
- * emulating TLS which does not affect on scheduling. Don't use this ID to make
- * any decisions.
- *
- * It is to be noted here that dedicated task pool will always report thread ID
- * of 0.
- */
-
-typedef function<void(int thread_id)> TaskRunFunction;
-
-/* Task
- *
- * Base class for tasks to be executed in threads. */
-
-class Task {
- public:
-  Task(){};
-  explicit Task(const TaskRunFunction &run_) : run(run_)
-  {
-  }
-
-  virtual ~Task()
-  {
-  }
-
-  TaskRunFunction run;
-};
+typedef function<void(void)> TaskRunFunction;
 
 /* Task Pool
  *
@@ -68,8 +36,7 @@ class Task {
  * pool, we can wait for all tasks to be done, or cancel them before they are
  * done.
  *
- * The run callback that actually executes the task may be created like this:
- * function_bind(&MyClass::task_execute, this, _1, _2) */
+ * TaskRunFunction may be created with std::bind or lambda expressions. */
 
 class TaskPool {
  public:
@@ -89,27 +56,15 @@ class TaskPool {
   TaskPool();
   ~TaskPool();
 
-  void push(Task *task, bool front = false);
-  void push(const TaskRunFunction &run, bool front = false);
+  void push(TaskRunFunction &&task);
 
   void wait_work(Summary *stats = NULL); /* work and wait until all tasks are done */
-  void cancel();                         /* cancel all tasks, keep worker threads running */
-  void stop();                           /* stop all worker threads */
-  bool finished();                       /* check if all work has been completed */
+  void cancel(); /* cancel all tasks and wait until they are no longer executing */
 
   bool canceled(); /* for worker threads, test if canceled */
 
  protected:
-  friend class TaskScheduler;
-
-  void num_decrease(int done);
-  void num_increase();
-
-  thread_mutex num_mutex;
-  thread_condition_variable num_cond;
-
-  int num;
-  bool do_cancel;
+  tbb::task_group tbb_group;
 
   /* ** Statistics ** */
 
@@ -131,40 +86,19 @@ class TaskScheduler {
   static void exit();
   static void free_memory();
 
-  /* number of threads that can work on task */
-  static int num_threads()
-  {
-    return threads.size();
-  }
-
-  /* test if any session is using the scheduler */
-  static bool active()
-  {
-    return users != 0;
-  }
+  /* Approximate number of threads that will work on task, which may be lower
+   * or higher than the actual number of threads. Use as little as possible and
+   * leave splitting up tasks to the scheduler. */
+  static int num_threads();
 
  protected:
-  friend class TaskPool;
-
-  struct Entry {
-    Task *task;
-    TaskPool *pool;
-  };
-
   static thread_mutex mutex;
   static int users;
-  static vector<thread *> threads;
-  static bool do_exit;
+  static int active_num_threads;
 
-  static list<Entry> queue;
-  static thread_mutex queue_mutex;
-  static thread_condition_variable queue_cond;
-
-  static void thread_run(int thread_id);
-  static bool thread_wait_pop(Entry &entry);
-
-  static void push(Entry &entry, bool front);
-  static void clear(TaskPool *pool);
+#ifdef WITH_TBB_GLOBAL_CONTROL
+  static tbb::global_control *global_control;
+#endif
 };
 
 /* Dedicated Task Pool
@@ -179,12 +113,10 @@ class DedicatedTaskPool {
   DedicatedTaskPool();
   ~DedicatedTaskPool();
 
-  void push(Task *task, bool front = false);
-  void push(const TaskRunFunction &run, bool front = false);
+  void push(TaskRunFunction &&run, bool front = false);
 
   void wait();   /* wait until all tasks are done */
   void cancel(); /* cancel all tasks, keep worker thread running */
-  void stop();   /* stop worker thread */
 
   bool canceled(); /* for worker thread, test if canceled */
 
@@ -193,14 +125,14 @@ class DedicatedTaskPool {
   void num_increase();
 
   void thread_run();
-  bool thread_wait_pop(Task *&entry);
+  bool thread_wait_pop(TaskRunFunction &task);
 
   void clear();
 
   thread_mutex num_mutex;
   thread_condition_variable num_cond;
 
-  list<Task *> queue;
+  list<TaskRunFunction> queue;
   thread_mutex queue_mutex;
   thread_condition_variable queue_cond;
 
diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h
new file mode 100644
index 00000000000..301cb80c5b0
--- /dev/null
+++ b/intern/cycles/util/util_tbb.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TBB_H__
+#define __UTIL_TBB_H__
+
+/* TBB includes <windows.h>, do it ourselves first so we are sure
+ * WIN32_LEAN_AND_MEAN and similar are defined beforehand. */
+#include "util_windows.h"
+
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#include <tbb/tbb.h>
+
+#if TBB_INTERFACE_VERSION_MAJOR >= 10
+#  define WITH_TBB_GLOBAL_CONTROL
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+using tbb::blocked_range;
+using tbb::enumerable_thread_specific;
+using tbb::parallel_for;
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TBB_H__ */
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index bb2c99cc6d7..8bce5ff85aa 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -22,7 +22,7 @@
 CCL_NAMESPACE_BEGIN
 
 #define CYCLES_VERSION_MAJOR 1
-#define CYCLES_VERSION_MINOR 12
+#define CYCLES_VERSION_MINOR 13
 #define CYCLES_VERSION_PATCH 0
 
 #define CYCLES_MAKE_VERSION_STRING2(a, b, c) #a "." #b "." #c