diff options
Diffstat (limited to 'intern/cycles')
259 files changed, 16290 insertions, 5429 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 806a8660e8c..c53a9f91cc0 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -191,7 +191,7 @@ endif() # Logging capabilities using GLog library. if(WITH_CYCLES_LOGGING) add_definitions(-DWITH_CYCLES_LOGGING) - add_definitions(-DGOOGLE_GLOG_DLL_DECL=) + add_definitions(${GLOG_DEFINES}) add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE}) include_directories( SYSTEM diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index aabb8f63640..08a3931ef46 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -35,18 +35,15 @@ if(WITH_CYCLES_OSL) list(APPEND LIBRARIES cycles_kernel_osl) endif() -if(CYCLES_STANDALONE_REPOSITORY) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES - ${GLOG_LIBRARIES} - ${GFLAGS_LIBRARIES} - ) - endif() -else() +if(NOT CYCLES_STANDALONE_REPOSITORY) list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES extern_glog extern_gflags) - endif() +endif() + +if(WITH_CYCLES_LOGGING) + list(APPEND LIBRARIES + ${GLOG_LIBRARIES} + ${GFLAGS_LIBRARIES} + ) endif() if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index eb792af7264..a2d6262fb20 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -102,6 +102,9 @@ class CyclesRender(bpy.types.RenderEngine): else: self.report({'ERROR'}, "OSL support disabled in this build.") + def update_render_passes(self, scene, srl): + engine.register_passes(self, scene, srl) + def engine_exit(): engine.exit() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index ab57dd44bdb..3018fd5b316 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -205,3 +205,49 @@ 
def with_network(): def system_info(): import _cycles return _cycles.system_info() + +def register_passes(engine, scene, srl): + engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR') + + if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE') + if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE') + if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR') + if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR') + if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR') + if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE') + if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE') + if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR') + if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR') + if 
srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR') + if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR') + if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR') + + crl = srl.cycles + if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE') + if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE') + if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE') + if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE') + + cscene = scene.cycles + if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine: + engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index cbf469b3a89..68474529ed3 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py 
@@ -695,10 +695,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): update=devices_update_callback ) - cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback); + cls.debug_opencl_kernel_single_program = BoolProperty( + name="Single Program", + default=True, + update=devices_update_callback, + ) cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False) + cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0, + description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)") + @classmethod def unregister(cls): del bpy.types.Scene.cycles @@ -1166,6 +1173,125 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup): def unregister(cls): del bpy.types.Scene.cycles_curves +def update_render_passes(self, context): + scene = context.scene + rd = scene.render + rl = rd.layers.active + rl.update_render_passes() + +class CyclesRenderLayerSettings(bpy.types.PropertyGroup): + @classmethod + def register(cls): + bpy.types.SceneRenderLayer.cycles = PointerProperty( + name="Cycles SceneRenderLayer Settings", + description="Cycles SceneRenderLayer Settings", + type=cls, + ) + cls.pass_debug_bvh_traversed_nodes = BoolProperty( + name="Debug BVH Traversed Nodes", + description="Store Debug BVH Traversed Nodes pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_traversed_instances = BoolProperty( + name="Debug BVH Traversed Instances", + description="Store Debug BVH Traversed Instances pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_intersections = BoolProperty( + name="Debug BVH Intersections", + description="Store Debug BVH Intersections", + default=False, + update=update_render_passes, + ) + cls.pass_debug_ray_bounces = BoolProperty( + name="Debug Ray Bounces", + description="Store Debug Ray Bounces pass", + default=False, + update=update_render_passes, + ) + + cls.use_denoising = 
BoolProperty( + name="Use Denoising", + description="Denoise the rendered image", + default=False, + update=update_render_passes, + ) + cls.denoising_diffuse_direct = BoolProperty( + name="Diffuse Direct", + description="Denoise the direct diffuse lighting", + default=True, + ) + cls.denoising_diffuse_indirect = BoolProperty( + name="Diffuse Indirect", + description="Denoise the indirect diffuse lighting", + default=True, + ) + cls.denoising_glossy_direct = BoolProperty( + name="Glossy Direct", + description="Denoise the direct glossy lighting", + default=True, + ) + cls.denoising_glossy_indirect = BoolProperty( + name="Glossy Indirect", + description="Denoise the indirect glossy lighting", + default=True, + ) + cls.denoising_transmission_direct = BoolProperty( + name="Transmission Direct", + description="Denoise the direct transmission lighting", + default=True, + ) + cls.denoising_transmission_indirect = BoolProperty( + name="Transmission Indirect", + description="Denoise the indirect transmission lighting", + default=True, + ) + cls.denoising_subsurface_direct = BoolProperty( + name="Subsurface Direct", + description="Denoise the direct subsurface lighting", + default=True, + ) + cls.denoising_subsurface_indirect = BoolProperty( + name="Subsurface Indirect", + description="Denoise the indirect subsurface lighting", + default=True, + ) + cls.denoising_strength = FloatProperty( + name="Denoising Strength", + description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_feature_strength = FloatProperty( + name="Denoising Feature Strength", + description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_radius = IntProperty( + name="Denoising Radius", + description="Size of the image area that's used to denoise a pixel (higher 
values are smoother, but might lose detail and are slower)", + min=1, max=25, + default=8, + ) + cls.denoising_relative_pca = BoolProperty( + name="Relative filter", + description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", + default=False, + ) + cls.denoising_store_passes = BoolProperty( + name="Store denoising passes", + description="Store the denoising feature passes and the noisy image", + default=False, + update=update_render_passes, + ) + + @classmethod + def unregister(cls): + del bpy.types.SceneRenderLayer.cycles + class CyclesCurveSettings(bpy.types.PropertyGroup): @classmethod @@ -1297,14 +1423,14 @@ class CyclesPreferences(bpy.types.AddonPreferences): row = layout.row() if self.compute_device_type == 'CUDA' and cuda_devices: - col = row.column(align=True) + box = row.box() for device in cuda_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) if self.compute_device_type == 'OPENCL' and opencl_devices: - col = row.column(align=True) + box = row.box() for device in opencl_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) def draw(self, context): @@ -1324,6 +1450,7 @@ def register(): bpy.utils.register_class(CyclesCurveSettings) bpy.utils.register_class(CyclesDeviceSettings) bpy.utils.register_class(CyclesPreferences) + bpy.utils.register_class(CyclesRenderLayerSettings) def unregister(): @@ -1339,3 +1466,4 @@ def unregister(): bpy.utils.unregister_class(CyclesCurveSettings) bpy.utils.unregister_class(CyclesDeviceSettings) bpy.utils.unregister_class(CyclesPreferences) + bpy.utils.unregister_class(CyclesRenderLayerSettings) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 2b50d272be8..49beebe5ab4 100644 --- a/intern/cycles/blender/addon/ui.py +++ 
b/intern/cycles/blender/addon/ui.py @@ -78,7 +78,7 @@ def use_cuda(context): def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context)) + return (cscene.progressive == 'BRANCHED_PATH') def use_sample_all_lights(context): @@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): row = layout.row() sub = row.row() - sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context) sub.prop(cscene, "progressive", text="") row.prop(cscene, "use_square_samples") @@ -204,8 +203,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): col.prop(cscene, "sample_all_lights_direct") col.prop(cscene, "sample_all_lights_indirect") - if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'): - layout.row().prop(cscene, "sampling_pattern", text="Pattern") + layout.row().prop(cscene, "sampling_pattern", text="Pattern") for rl in scene.render.layers: if rl.samples > 0: @@ -478,11 +476,14 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): bl_options = {'DEFAULT_CLOSED'} def draw(self, context): + import _cycles + layout = self.layout scene = context.scene rd = scene.render rl = rd.layers.active + crl = rl.cycles split = layout.split() @@ -529,8 +530,18 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(rl, "use_pass_emit", text="Emission") col.prop(rl, "use_pass_environment") - if hasattr(rd, "debug_pass_type"): - layout.prop(rd, "debug_pass_type") + if context.scene.cycles.feature_set == 'EXPERIMENTAL': + col.separator() + sub = col.column() + sub.active = crl.use_denoising + sub.prop(crl, "denoising_store_passes", text="Denoising") + + if _cycles.with_cycles_debug: + col = layout.column() + col.prop(crl, "pass_debug_bvh_traversed_nodes") + col.prop(crl, "pass_debug_bvh_traversed_instances") + col.prop(crl, "pass_debug_bvh_intersections") + col.prop(crl, "pass_debug_ray_bounces") class 
CyclesRender_PT_views(CyclesButtonsPanel, Panel): @@ -576,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel): row.prop(rv, "camera_suffix", text="") +class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel): + bl_label = "Denoising" + bl_context = "render_layer" + bl_options = {'DEFAULT_CLOSED'} + + def draw_header(self, context): + rd = context.scene.render + rl = rd.layers.active + crl = rl.cycles + cscene = context.scene.cycles + layout = self.layout + + layout.active = not cscene.use_progressive_refine + layout.prop(crl, "use_denoising", text="") + + def draw(self, context): + layout = self.layout + + scene = context.scene + cscene = scene.cycles + rd = scene.render + rl = rd.layers.active + crl = rl.cycles + + layout.active = crl.use_denoising and not cscene.use_progressive_refine + + split = layout.split() + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_radius", text="Radius") + sub.prop(crl, "denoising_strength", slider=True, text="Strength") + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength") + sub.prop(crl, "denoising_relative_pca") + + layout.separator() + + row = layout.row() + row.label(text="Diffuse:") + sub = row.row(align=True) + sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Glossy:") + sub = row.row(align=True) + sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Transmission:") + sub = row.row(align=True) + sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Subsurface:") + sub = 
row.row(align=True) + sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True) + + class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -1532,6 +1608,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): col.prop(cscene, "debug_opencl_device_type", text="Device") col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program") col.prop(cscene, "debug_use_opencl_debug", text="Debug") + col.prop(cscene, "debug_opencl_mem_limit") class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): @@ -1634,7 +1711,7 @@ def draw_device(self, context): layout.prop(cscene, "feature_set") - split = layout.split(percentage=1/3) + split = layout.split(percentage=1 / 3) split.label("Device:") row = split.row() row.active = show_device_active(context) @@ -1729,6 +1806,7 @@ classes = ( CyclesRender_PT_layer_options, CyclesRender_PT_layer_passes, CyclesRender_PT_views, + CyclesRender_PT_denoising, Cycles_PT_post_processing, CyclesCamera_PT_dof, Cycles_PT_context_material, diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 6fa038e8bf0..42b985305ea 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -411,7 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, } } - mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -546,7 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, } } - mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); 
mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -776,17 +776,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData, for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) { for(int section = 0; section < resol; section++) { - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; } } @@ -1004,7 +1004,7 @@ void BlenderSync::sync_curves(Mesh *mesh, for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++) if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f)) - fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]); + fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]); } } } diff --git 
a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 54571b1fea1..b4cca5f00f4 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "render/mesh.h" #include "render/object.h" #include "render/scene.h" @@ -51,8 +50,7 @@ enum { * Two triangles has vertex indices in the original Blender-side face. * If face is already a quad tri_b will not be initialized. */ -inline void face_split_tri_indices(const int num_verts, - const int face_flag, +inline void face_split_tri_indices(const int face_flag, int tri_a[3], int tri_b[3]) { @@ -60,21 +58,19 @@ inline void face_split_tri_indices(const int num_verts, tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 3; - if(num_verts == 4) { - tri_b[0] = 2; - tri_b[1] = 3; - tri_b[2] = 1; - } + + tri_b[0] = 2; + tri_b[1] = 3; + tri_b[2] = 1; } else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ { tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 2; - if(num_verts == 4) { - tri_b[0] = 0; - tri_b[1] = 2; - tri_b[2] = 3; - } + + tri_b[0] = 0; + tri_b[1] = 2; + tri_b[2] = 3; } } @@ -251,7 +247,7 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh, for(int i = 0; i < nverts.size(); i++) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]); tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]); @@ -293,7 +289,7 @@ static void create_mesh_volume_attribute(BL::Object& b_ob, if(!b_domain) return; - + Attribute *attr = mesh->attributes.add(std); VoxelAttribute *volume_data = attr->data_voxel(); bool is_float, is_linear; @@ -356,7 +352,7 @@ static void attr_create_vertex_color(Scene *scene, int n = p->loop_total(); for(int i = 0; i < n; i++) { float3 color = get_float3(l->data[p->loop_start() + i].color()); - *(cdata++) = 
color_float_to_byte(color_srgb_to_scene_linear(color)); + *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color)); } } } @@ -377,14 +373,14 @@ static void attr_create_vertex_color(Scene *scene, for(l->data.begin(c); c != l->data.end(); ++c, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); uchar4 colors[4]; - colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1()))); - colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2()))); - colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3()))); + colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1()))); + colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2()))); + colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3()))); if(nverts[i] == 4) { - colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4()))); + colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4()))); } cdata[0] = colors[tri_a[0]]; @@ -470,7 +466,7 @@ static void attr_create_uv_map(Scene *scene, for(l->data.begin(t); t != l->data.end(); ++t, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); float3 uvs[4]; uvs[0] = get_float3(t->uv1()); @@ -982,7 +978,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, else used_shaders.push_back(scene->default_surface); } - + /* test if we need to sync */ int requested_geometry_flags = Mesh::GEOMETRY_NONE; if(render_layer.use_surfaces) { @@ -1017,12 +1013,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, /* ensure we only sync instanced meshes once */ if(mesh_synced.find(mesh) != mesh_synced.end()) return mesh; - + mesh_synced.insert(mesh); /* create derived mesh */ array<int> 
oldtriangle = mesh->triangles; - + /* compares curve_keys rather than strands in order to handle quick hair * adjustments in dynamic BVH - other methods could probably do this better*/ array<float3> oldcurve_keys = mesh->curve_keys; @@ -1111,7 +1107,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0) rebuild = true; } - + mesh->tag_update(scene, rebuild); return mesh; @@ -1140,7 +1136,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(scene->need_motion() == Scene::MOTION_BLUR) { if(!mesh->use_motion_blur) return; - + /* see if this mesh needs motion data at this time */ vector<float> object_times = object->motion_times(); bool found = false; @@ -1172,7 +1168,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(!numverts && !numkeys) return; - + /* skip objects without deforming modifiers. this is not totally reliable, * would need a more extensive check to see which objects are animated */ BL::Mesh b_mesh(PointerRNA_NULL); diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index d05699236cc..a930c439370 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -379,27 +379,16 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, } } - /* random number */ - object->random_id = hash_string(object->name.c_str()); - - if(persistent_id) { - for(int i = 0; i < OBJECT_PERSISTENT_ID_SIZE; i++) - object->random_id = hash_int_2d(object->random_id, persistent_id[i]); - } - else - object->random_id = hash_int_2d(object->random_id, 0); - - if(b_parent.ptr.data != b_ob.ptr.data) - object->random_id ^= hash_int(hash_string(b_parent.name().c_str())); - - /* dupli texture coordinates */ + /* dupli texture coordinates and random_id */ if(b_dupli_ob) { object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f); object->dupli_uv = 
get_float2(b_dupli_ob.uv()); + object->random_id = b_dupli_ob.random_id(); } else { object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f); object->dupli_uv = make_float2(0.0f, 0.0f); + object->random_id = hash_int_2d(hash_string(object->name.c_str()), 0); } object->tag_update(scene); @@ -489,7 +478,7 @@ static bool object_render_hide_duplis(BL::Object& b_ob) /* Object Loop */ -void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) +void BlenderSync::sync_objects(float motion_time) { /* layer data */ uint scene_layer = render_layer.scene_layer; @@ -517,7 +506,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) * 1 : DAG_EVAL_PREVIEW * 2 : DAG_EVAL_RENDER */ - int dupli_settings = preview ? 1 : 2; + int dupli_settings = (render_layer.use_viewport_visibility) ? 1 : 2; bool cancel = false; bool use_portal = false; @@ -552,7 +541,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) { Transform tfm = get_transform(b_dup->matrix()); BL::Object b_dup_ob = b_dup->object(); - bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render(); + bool dup_hide = (render_layer.use_viewport_visibility)? 
b_dup_ob.hide(): b_dup_ob.hide_render(); bool in_dupli_group = (b_dup->type() == BL::DupliObject::type_GROUP); bool hide_tris; @@ -628,7 +617,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) } void BlenderSync::sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state) @@ -665,7 +653,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, b_engine.frame_set(frame, subframe); python_thread_state_save(python_thread_state); sync_camera_motion(b_render, b_cam, width, height, 0.0f); - sync_objects(b_v3d, 0.0f); + sync_objects(0.0f); } /* always sample these times for camera motion */ @@ -699,7 +687,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, } /* sync object */ - sync_objects(b_v3d, relative_time); + sync_objects(relative_time); } /* we need to set the python thread state again because this diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index d509e9de981..54973fd1b7f 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -106,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) } /* Synchronize other OpenCL flags. 
*/ flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); + flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024; flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program"); return flags.opencl.device_type != opencl_device_type || flags.opencl.kernel_type != opencl_kernel_type; @@ -811,6 +812,14 @@ void *CCL_python_module_init() PyModule_AddStringConstant(mod, "osl_version_string", "unknown"); #endif +#ifdef WITH_CYCLES_DEBUG + PyModule_AddObject(mod, "with_cycles_debug", Py_True); + Py_INCREF(Py_True); +#else + PyModule_AddObject(mod, "with_cycles_debug", Py_False); + Py_INCREF(Py_False); +#endif + #ifdef WITH_NETWORK PyModule_AddObject(mod, "with_network", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 26f9bccd95d..12de3da063f 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -129,9 +129,9 @@ void BlenderSession::create_session() scene = new Scene(scene_params, session_params.device); /* setup callbacks for builtin image support */ - scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7); - scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4); - scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4); + scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7, _8); + scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5); + scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5); /* create session */ 
session = new Session(session_params); @@ -243,90 +243,6 @@ void BlenderSession::free_session() delete session; } -static PassType get_pass_type(BL::RenderPass& b_pass) -{ - switch(b_pass.type()) { - case BL::RenderPass::type_COMBINED: - return PASS_COMBINED; - - case BL::RenderPass::type_Z: - return PASS_DEPTH; - case BL::RenderPass::type_MIST: - return PASS_MIST; - case BL::RenderPass::type_NORMAL: - return PASS_NORMAL; - case BL::RenderPass::type_OBJECT_INDEX: - return PASS_OBJECT_ID; - case BL::RenderPass::type_UV: - return PASS_UV; - case BL::RenderPass::type_VECTOR: - return PASS_MOTION; - case BL::RenderPass::type_MATERIAL_INDEX: - return PASS_MATERIAL_ID; - - case BL::RenderPass::type_DIFFUSE_DIRECT: - return PASS_DIFFUSE_DIRECT; - case BL::RenderPass::type_GLOSSY_DIRECT: - return PASS_GLOSSY_DIRECT; - case BL::RenderPass::type_TRANSMISSION_DIRECT: - return PASS_TRANSMISSION_DIRECT; - case BL::RenderPass::type_SUBSURFACE_DIRECT: - return PASS_SUBSURFACE_DIRECT; - - case BL::RenderPass::type_DIFFUSE_INDIRECT: - return PASS_DIFFUSE_INDIRECT; - case BL::RenderPass::type_GLOSSY_INDIRECT: - return PASS_GLOSSY_INDIRECT; - case BL::RenderPass::type_TRANSMISSION_INDIRECT: - return PASS_TRANSMISSION_INDIRECT; - case BL::RenderPass::type_SUBSURFACE_INDIRECT: - return PASS_SUBSURFACE_INDIRECT; - - case BL::RenderPass::type_DIFFUSE_COLOR: - return PASS_DIFFUSE_COLOR; - case BL::RenderPass::type_GLOSSY_COLOR: - return PASS_GLOSSY_COLOR; - case BL::RenderPass::type_TRANSMISSION_COLOR: - return PASS_TRANSMISSION_COLOR; - case BL::RenderPass::type_SUBSURFACE_COLOR: - return PASS_SUBSURFACE_COLOR; - - case BL::RenderPass::type_EMIT: - return PASS_EMISSION; - case BL::RenderPass::type_ENVIRONMENT: - return PASS_BACKGROUND; - case BL::RenderPass::type_AO: - return PASS_AO; - case BL::RenderPass::type_SHADOW: - return PASS_SHADOW; - - case BL::RenderPass::type_DIFFUSE: - case BL::RenderPass::type_COLOR: - case BL::RenderPass::type_REFRACTION: - case 
BL::RenderPass::type_SPECULAR: - case BL::RenderPass::type_REFLECTION: - return PASS_NONE; -#ifdef WITH_CYCLES_DEBUG - case BL::RenderPass::type_DEBUG: - { - switch(b_pass.debug_type()) { - case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES: - return PASS_BVH_TRAVERSED_NODES; - case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES: - return PASS_BVH_TRAVERSED_INSTANCES; - case BL::RenderPass::debug_type_BVH_INTERSECTIONS: - return PASS_BVH_INTERSECTIONS; - case BL::RenderPass::debug_type_RAY_BOUNCES: - return PASS_RAY_BOUNCES; - } - break; - } -#endif - } - - return PASS_NONE; -} - static ShaderEvalType get_shader_type(const string& pass_type) { const char *shader_type = pass_type.c_str(); @@ -383,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine, static void end_render_result(BL::RenderEngine& b_engine, BL::RenderResult& b_rr, bool cancel, + bool highlight, bool do_merge_results) { - b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results); + b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results); } -void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only) +void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight) { BufferParams& params = rtile.buffers->params; int x = params.full_x - session->tile_manager.params.full_x; @@ -424,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda update_render_result(b_rr, b_rlay, rtile); } - end_render_result(b_engine, b_rr, true, true); + end_render_result(b_engine, b_rr, true, highlight, true); } else { /* write result */ write_render_result(b_rr, b_rlay, rtile); - end_render_result(b_engine, b_rr, false, true); + end_render_result(b_engine, b_rr, false, false, true); } } void BlenderSession::write_render_tile(RenderTile& rtile) { - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } -void 
BlenderSession::update_render_tile(RenderTile& rtile) +void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight) { /* use final write for preview renders, otherwise render result wouldn't be * be updated in blender side * would need to be investigated a bit further, but for now shall be fine */ if(!b_engine.is_preview()) - do_write_update_render_tile(rtile, true); + do_write_update_render_tile(rtile, true, highlight); else - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } void BlenderSession::render() { /* set callback to write out render results */ session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1); + session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2); /* get buffer parameters */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); @@ -475,33 +392,38 @@ void BlenderSession::render() /* layer will be missing if it was disabled in the UI */ if(b_single_rlay == b_rr.layers.end()) { - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); continue; } BL::RenderLayer b_rlay = *b_single_rlay; /* add passes */ - array<Pass> passes; - Pass::add(PASS_COMBINED, passes); - - if(session_params.device.advanced_shading) { - - /* loop over passes */ - BL::RenderLayer::passes_iterator b_pass_iter; - - for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { - BL::RenderPass b_pass(*b_pass_iter); - PassType pass_type = get_pass_type(b_pass); + array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params); + buffer_params.passes = passes; - if(pass_type == PASS_MOTION && scene->integrator->motion_blur) - continue; - if(pass_type != PASS_NONE) - Pass::add(pass_type, passes); - } 
- } + PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); + bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising"); + buffer_params.denoising_data_pass = use_denoising; + session->tile_manager.schedule_denoising = use_denoising; + session->params.use_denoising = use_denoising; + scene->film->denoising_data_pass = buffer_params.denoising_data_pass; + scene->film->denoising_flags = 0; + if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR; + if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND; + if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR; + if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND; + if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR; + if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND; + if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR; + if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND; + scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass; + session->params.denoising_radius = get_int(crl, "denoising_radius"); + session->params.denoising_strength = get_float(crl, "denoising_strength"); + session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); + session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca"); - buffer_params.passes = passes; scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold(); 
scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); @@ -555,7 +477,7 @@ void BlenderSession::render() } /* free result without merging */ - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); if(session->progress.get_cancel()) break; @@ -636,8 +558,6 @@ void BlenderSession::bake(BL::Object& b_object, float result[]) { ShaderEvalType shader_type = get_shader_type(pass_type); - size_t object_index = OBJECT_NONE; - int tri_offset = 0; /* Set baking flag in advance, so kernel loading can check if we need * any baking capabilities. @@ -647,9 +567,6 @@ void BlenderSession::bake(BL::Object& b_object, /* ensure kernels are loaded before we do any scene updates */ session->load_kernels(); - if(session->progress.get_cancel()) - return; - if(shader_type == SHADER_EVAL_UV) { /* force UV to be available */ Pass::add(PASS_UV, scene->film->passes); @@ -667,50 +584,61 @@ void BlenderSession::bake(BL::Object& b_object, scene->film->tag_update(scene); scene->integrator->tag_update(scene); - /* update scene */ - BL::Object b_camera_override(b_engine.camera_override()); - sync->sync_camera(b_render, b_camera_override, width, height, ""); - sync->sync_data(b_render, - b_v3d, - b_camera_override, - width, height, - &python_thread_state, - b_rlay_name.c_str()); + if(!session->progress.get_cancel()) { + /* update scene */ + BL::Object b_camera_override(b_engine.camera_override()); + sync->sync_camera(b_render, b_camera_override, width, height, ""); + sync->sync_data(b_render, + b_v3d, + b_camera_override, + width, height, + &python_thread_state, + b_rlay_name.c_str()); + } - /* get buffer parameters */ - SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); + BakeData *bake_data = NULL; + + if(!session->progress.get_cancel()) { + /* get buffer 
parameters */ + SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); + BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); - scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); + scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); - /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */ - for(size_t i = 0; i < scene->objects.size(); i++) { - if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { - object_index = i; - tri_offset = scene->objects[i]->mesh->tri_offset; - break; - } - } + /* find object index. 
todo: is arbitrary - copied from mesh_displace.cpp */ + size_t object_index = OBJECT_NONE; + int tri_offset = 0; - int object = object_index; + for(size_t i = 0; i < scene->objects.size(); i++) { + if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { + object_index = i; + tri_offset = scene->objects[i]->mesh->tri_offset; + break; + } + } - BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + int object = object_index; - populate_bake_data(bake_data, object_id, pixel_array, num_pixels); + bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + populate_bake_data(bake_data, object_id, pixel_array, num_pixels); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); - session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + } - scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + /* Perform bake. Check cancel to avoid crash with incomplete scene data. 
*/ + if(!session->progress.get_cancel()) { + scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + } /* free all memory used (host and device), so we wouldn't leave render * engine with extra memory allocated @@ -753,19 +681,31 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, BL::RenderPass b_pass(*b_iter); /* find matching pass type */ - PassType pass_type = get_pass_type(b_pass); + PassType pass_type = BlenderSync::get_pass_type(b_pass); int components = b_pass.channels(); - /* copy pixels */ - if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0])) + bool read = false; + if(pass_type != PASS_NONE) { + /* copy pixels */ + read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]); + } + else { + int denoising_offset = BlenderSync::get_denoising_pass(b_pass); + if(denoising_offset >= 0) { + read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]); + } + } + + if(!read) { memset(&pixels[0], 0, pixels.size()*sizeof(float)); + } b_pass.rect(&pixels[0]); } } else { /* copy combined pass */ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str())); + BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0])) b_combined_pass.rect(&pixels[0]); } @@ -1073,7 +1013,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, int &width, int &height, int &depth, - int &channels) + int &channels, + bool& free_cache) { /* empty image */ is_float = false; @@ -1081,6 +1022,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name, height = 1; depth = 0; channels = 0; + free_cache = false; if(!builtin_data) return; @@ -1094,6 +1036,7 @@ void BlenderSession::builtin_image_info(const string 
&builtin_name, /* image data */ BL::Image b_image(b_id); + free_cache = !b_image.has_data(); is_float = b_image.is_float(); width = b_image.size()[0]; height = b_image.size()[1]; @@ -1154,7 +1097,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1175,7 +1119,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1194,6 +1137,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, } } } + + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + /* Premultiply, byte images are always straight for Blender. */ unsigned char *cp = pixels; for(size_t i = 0; i < num_pixels; i++, cp += channels) { @@ -1207,7 +1160,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1232,7 +1186,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(float)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1252,6 +1205,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, } } + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. 
*/ + if(free_cache) { + b_image.buffers_free(); + } + return true; } else if(b_id.is_a(&RNA_Object)) { diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 22b21a18f2e..cbd2303d282 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -79,7 +79,7 @@ public: void update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, RenderTile& rtile); - void update_render_tile(RenderTile& rtile); + void update_render_tile(RenderTile& rtile, bool highlight); /* interactive updates */ void synchronize(); @@ -147,7 +147,7 @@ protected: BL::RenderLayer& b_rlay, RenderTile& rtile, bool do_update_only); - void do_write_update_render_tile(RenderTile& rtile, bool do_update_only); + void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight); int builtin_image_frame(const string &builtin_name); void builtin_image_info(const string &builtin_name, @@ -156,15 +156,18 @@ protected: int &width, int &height, int &depth, - int &channels); + int &channels, + bool &free_cache); bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); /* Update tile manager to reflect resumable render settings. 
*/ void update_resumable_tile_manager(int num_samples); diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 3f04f11aab4..bdbab1006c0 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -521,6 +521,19 @@ static ShaderNode *add_node(Scene *scene, } node = hair; } + else if(b_node.is_a(&RNA_ShaderNodeBsdfPrincipled)) { + BL::ShaderNodeBsdfPrincipled b_principled_node(b_node); + PrincipledBsdfNode *principled = new PrincipledBsdfNode(); + switch (b_principled_node.distribution()) { + case BL::ShaderNodeBsdfPrincipled::distribution_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + break; + case BL::ShaderNodeBsdfPrincipled::distribution_MULTI_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + break; + } + node = principled; + } else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) { node = new TranslucentBsdfNode(); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 3b071bf0e7d..3a00384458a 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -210,10 +210,9 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render, scene->need_motion() == Scene::MOTION_NONE || scene->camera->motion_position == Camera::MOTION_POSITION_CENTER) { - sync_objects(b_v3d); + sync_objects(); } sync_motion(b_render, - b_v3d, b_override, width, height, python_thread_state); @@ -330,6 +329,9 @@ void BlenderSync::sync_integrator() integrator->ao_bounces = get_int(cscene, "ao_bounces_render"); } } + else { + integrator->ao_bounces = 0; + } if(integrator->modified(previntegrator)) integrator->tag_update(scene); @@ -480,6 +482,137 @@ void BlenderSync::sync_images() } } +/* Passes */ +PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); +#define MAP_PASS(passname, passtype) if(name == passname) return passtype; + 
/* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + MAP_PASS("Combined", PASS_COMBINED); + MAP_PASS("Depth", PASS_DEPTH); + MAP_PASS("Mist", PASS_MIST); + MAP_PASS("Normal", PASS_NORMAL); + MAP_PASS("IndexOB", PASS_OBJECT_ID); + MAP_PASS("UV", PASS_UV); + MAP_PASS("Vector", PASS_MOTION); + MAP_PASS("IndexMA", PASS_MATERIAL_ID); + + MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT); + MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT); + MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT); + MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT); + + MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT); + MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT); + MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT); + MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT); + + MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR); + MAP_PASS("GlossCol", PASS_GLOSSY_COLOR); + MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR); + MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR); + + MAP_PASS("Emit", PASS_EMISSION); + MAP_PASS("Env", PASS_BACKGROUND); + MAP_PASS("AO", PASS_AO); + MAP_PASS("Shadow", PASS_SHADOW); + +#ifdef __KERNEL_DEBUG__ + MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES); + MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES); + MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS); + MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); +#endif +#undef MAP_PASS + + return PASS_NONE; +} + +int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); + if(name.substr(0, 10) != "Denoising ") { + return -1; + } + name = name.substr(10); + +#define MAP_PASS(passname, offset) if(name == passname) return offset; + MAP_PASS("Normal", DENOISING_PASS_NORMAL); + MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); + MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); + MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); + MAP_PASS("Depth", DENOISING_PASS_DEPTH); + MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); + MAP_PASS("Shadow A", 
DENOISING_PASS_SHADOW_A); + MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); + MAP_PASS("Image", DENOISING_PASS_COLOR); + MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); +#undef MAP_PASS + + return -1; +} + +array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params) +{ + array<Pass> passes; + Pass::add(PASS_COMBINED, passes); + + if(!session_params.device.advanced_shading) { + return passes; + } + + /* loop over passes */ + BL::RenderLayer::passes_iterator b_pass_iter; + + for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { + BL::RenderPass b_pass(*b_pass_iter); + PassType pass_type = get_pass_type(b_pass); + + if(pass_type == PASS_MOTION && scene->integrator->motion_blur) + continue; + if(pass_type != PASS_NONE) + Pass::add(pass_type, passes); + } + + PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); + if(get_boolean(crp, "denoising_store_passes") && + get_boolean(crp, "use_denoising") && + !session_params.progressive_refine) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); + } +#ifdef __KERNEL_DEBUG__ + if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { + b_engine.add_pass("Debug BVH Traversed Nodes", 1, 
"X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_NODES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) { + b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_intersections")) { + b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_INTERSECTIONS, passes); + } + if(get_boolean(crp, "pass_debug_ray_bounces")) { + b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_RAY_BOUNCES, passes); + } +#endif + + return passes; +} + /* Scene Parameters */ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 36bedc505af..4ec46424b5a 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -67,6 +67,9 @@ public: void **python_thread_state, const char *layer = 0); void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer); + array<Pass> sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params); void sync_integrator(); void sync_camera(BL::RenderSettings& b_render, BL::Object& b_override, @@ -93,13 +96,15 @@ public: Camera *cam, int width, int height); + static PassType get_pass_type(BL::RenderPass& b_pass); + static int get_denoising_pass(BL::RenderPass& b_pass); + private: /* sync */ void sync_lamps(bool update_all); void sync_materials(bool update_all); - void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f); + void sync_objects(float motion_time = 0.0f); void sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state); diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index 
abdbb6be0fd..363e19f7a20 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, bool calc_undeformed, Mesh::SubdivisionType subdivision_type) { - bool subsurf_mod_show_render; - bool subsurf_mod_show_viewport; + bool subsurf_mod_show_render = false; + bool subsurf_mod_show_viewport = false; if(subdivision_type != Mesh::SUBDIVISION_NONE) { BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1]; @@ -299,7 +299,7 @@ static inline uint get_layer(const BL::Array<int, 20>& array) for(uint i = 0; i < 20; i++) if(array[i]) layer |= (1 << i); - + return layer; } @@ -434,7 +434,7 @@ static inline string get_string(PointerRNA& ptr, const char *name) string str(cstr); if(cstr != cstrbuf) MEM_freeN(cstr); - + return str; } @@ -451,7 +451,7 @@ static inline string blender_absolute_path(BL::BlendData& b_data, { if(path.size() >= 2 && path[0] == '/' && path[1] == '/') { string dirname; - + if(b_id.library()) { BL::ID b_library_id(b_id.library()); dirname = blender_absolute_path(b_data, @@ -544,7 +544,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object& b_ob) return b_smd.domain_settings(); } } - + return BL::SmokeDomainSettings(PointerRNA_NULL); } @@ -816,4 +816,3 @@ protected: CCL_NAMESPACE_END #endif /* __BLENDER_UTIL_H__ */ - diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 4701d75350a..6078db5a8ca 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -8,6 +8,8 @@ set(INC_SYS set(SRC bvh.cpp + bvh2.cpp + bvh4.cpp bvh_binning.cpp bvh_build.cpp bvh_node.cpp @@ -18,6 +20,8 @@ set(SRC set(SRC_HEADERS bvh.h + bvh2.h + bvh4.h bvh_binning.h bvh_build.h bvh_node.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 58348d16746..33143e2d8aa 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -15,45 +15,32 @@ * 
limitations under the License. */ +#include "bvh/bvh.h" + #include "render/mesh.h" #include "render/object.h" -#include "render/scene.h" -#include "render/curves.h" -#include "bvh/bvh.h" +#include "bvh/bvh2.h" +#include "bvh/bvh4.h" #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" -#include "bvh/bvh_params.h" -#include "bvh/bvh_unaligned.h" -#include "util/util_debug.h" #include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_map.h" #include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_types.h" -#include "util/util_math.h" CCL_NAMESPACE_BEGIN /* Pack Utility */ -struct BVHStackEntry +BVHStackEntry::BVHStackEntry(const BVHNode *n, int i) + : node(n), idx(i) { - const BVHNode *node; - int idx; - - BVHStackEntry(const BVHNode* n = 0, int i = 0) - : node(n), idx(i) - { - } +} - int encodeIdx() const - { - return (node->is_leaf())? ~idx: idx; - } -}; +int BVHStackEntry::encodeIdx() const +{ + return (node->is_leaf())? ~idx: idx; +} /* BVH */ @@ -65,9 +52,9 @@ BVH::BVH(const BVHParams& params_, const vector<Object*>& objects_) BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) { if(params.use_qbvh) - return new QBVH(params, objects); + return new BVH4(params, objects); else - return new BinaryBVH(params, objects); + return new BVH2(params, objects); } /* Building */ @@ -418,832 +405,4 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) } } -/* Regular BVH */ - -static bool node_bvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - return node0->is_unaligned || node1->is_unaligned; -} - -BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ -} - -void BinaryBVH::pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf) -{ - assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - float4 data[BVH_NODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - 
if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); -} - -void BinaryBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - if(e0.node->is_unaligned || e1.node->is_unaligned) { - pack_unaligned_inner(e, e0, e1); - } else { - pack_aligned_inner(e, e0, e1); - } -} - -void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - pack_aligned_node(e.idx, - e0.node->bounds, e1.node->bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->visibility, e1.node->visibility); -} - -void BinaryBVH::pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - int4 data[BVH_NODE_SIZE] = { - make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, - visibility1 & ~PATH_RAY_NODE_UNALIGNED, - c0, c1), - make_int4(__float_as_int(b0.min.x), - __float_as_int(b1.min.x), - __float_as_int(b0.max.x), - __float_as_int(b1.max.x)), - make_int4(__float_as_int(b0.min.y), - __float_as_int(b1.min.y), - __float_as_int(b0.max.y), - __float_as_int(b1.max.y)), - make_int4(__float_as_int(b0.min.z), - __float_as_int(b1.min.z), - __float_as_int(b0.max.z), - __float_as_int(b1.max.z)), - }; - - memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); -} - -void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& 
e1) -{ - pack_unaligned_node(e.idx, - e0.node->get_aligned_space(), - e1.node->get_aligned_space(), - e0.node->bounds, - e1.node->bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->visibility, e1.node->visibility); -} - -void BinaryBVH::pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& bounds0, - const BoundBox& bounds1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - float4 data[BVH_UNALIGNED_NODE_SIZE]; - Transform space0 = BVHUnaligned::compute_node_transform(bounds0, - aligned_space0); - Transform space1 = BVHUnaligned::compute_node_transform(bounds1, - aligned_space1); - data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(c0), - __int_as_float(c1)); - - data[1] = space0.x; - data[2] = space0.y; - data[3] = space0.z; - data[4] = space1.x; - data[5] = space1.y; - data[6] = space1.z; - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); -} - -void BinaryBVH::pack_nodes(const BVHNode *root) -{ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_NODE_SIZE; - } - /* Resize arrays */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. 
*/ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_bvh_is_unaligned(root) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* innner node */ - int idx[2]; - for(int i = 0; i < 2; ++i) { - if(e.node->get_child(i)->is_leaf()) { - idx[i] = nextLeafNodeIdx++; - } - else { - idx[i] = nextNodeIdx; - nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - } - - stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); - stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); - - pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); - } - } - assert(node_size == nextNodeIdx); - /* root index to start traversal at, to handle case of single leaf node */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void BinaryBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? 
true: false, bbox, visibility); -} - -void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - const int4 *data = &pack.leaf_nodes[idx]; - const int c0 = data[0].x; - const int c1 = data[0].y; - /* refit leaf node */ - for(int prim = c0; prim < c1; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* object instance */ - bbox.grow(ob->bounds); - } - else { - /* primitives */ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* curves */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* motion curves */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* triangles */ - int tri_offset = (params.top_level)? 
mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* motion triangles */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): De-duplicate with pack_leaf(). */ - float4 leaf_data[BVH_NODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c0); - leaf_data[0].y = __int_as_float(c1); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(data[0].w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); - } - else { - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - - const int4 *data = &pack.nodes[idx]; - const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - const int c0 = data[0].z; - const int c1 = data[0].w; - /* refit inner node, set bbox from children */ - BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; - uint visibility0 = 0, visibility1 = 0; - - refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); - refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); - - if(is_unaligned) { - Transform aligned_space = transform_identity(); - pack_unaligned_node(idx, - aligned_space, aligned_space, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - else { - pack_aligned_node(idx, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - - bbox.grow(bbox0); - bbox.grow(bbox1); - visibility = visibility0|visibility1; - } -} - -/* QBVH */ - -/* Can we avoid this somehow or make more generic? - * - * Perhaps we can merge nodes in actual tree and make our - * life easier all over the place. 
- */ -static bool node_qbvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - bool has_unaligned = false; - if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned; - } - else { - has_unaligned |= node0->get_child(0)->is_unaligned; - has_unaligned |= node0->get_child(1)->is_unaligned; - } - if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned; - } - else { - has_unaligned |= node1->get_child(0)->is_unaligned; - has_unaligned |= node1->get_child(1)->is_unaligned; - } - return has_unaligned; -} - -QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ - params.use_qbvh = true; -} - -void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) -{ - float4 data[BVH_QNODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); -} - -void QBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - bool has_unaligned = false; - /* Check whether we have to create unaligned node or all nodes are aligned - * and we can cut some corner here. - */ - if(params.use_unaligned_nodes) { - for(int i = 0; i < num; i++) { - if(en[i].node->is_unaligned) { - has_unaligned = true; - break; - } - } - } - if(has_unaligned) { - /* There's no unaligned children, pack into AABB node. */ - pack_unaligned_inner(e, en, num); - } - else { - /* Create unaligned node with orientation transform for each of the - * children. 
- */ - pack_aligned_inner(e, en, num); - } -} - -void QBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_aligned_node(e.idx, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void QBVH::pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - float3 bb_min = bounds[i].min; - float3 bb_max = bounds[i].max; - - data[1][i] = bb_min.x; - data[2][i] = bb_max.x; - data[3][i] = bb_min.y; - data[4][i] = bb_max.y; - data[5][i] = bb_min.z; - data[6][i] = bb_max.z; - - data[7][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - data[1][i] = FLT_MAX; - data[2][i] = -FLT_MAX; - - data[3][i] = FLT_MAX; - data[4][i] = -FLT_MAX; - - data[5][i] = FLT_MAX; - data[6][i] = -FLT_MAX; - - data[7][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); -} - -void QBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - Transform aligned_space[4]; - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_unaligned_node(e.idx, - aligned_space, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void QBVH::pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_UNALIGNED_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - Transform space = BVHUnaligned::compute_node_transform( - bounds[i], - aligned_space[i]); - - data[1][i] = space.x.x; - data[2][i] = space.x.y; - data[3][i] = space.x.z; - - data[4][i] = space.y.x; - data[5][i] = space.y.y; - data[6][i] = space.y.z; - - data[7][i] = space.z.x; - data[8][i] = space.z.y; - data[9][i] = space.z.z; - - data[10][i] = space.x.w; - data[11][i] = space.y.w; - data[12][i] = space.z.w; - - data[13][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - - data[1][i] = 1.0f; - data[2][i] = 0.0f; - data[3][i] = 0.0f; - - data[4][i] = 0.0f; - data[5][i] = 0.0f; - data[6][i] = 0.0f; - - data[7][i] = 0.0f; - data[8][i] = 0.0f; - data[9][i] = 0.0f; - - data[10][i] = -FLT_MAX; - data[11][i] = -FLT_MAX; - data[12][i] = -FLT_MAX; - - data[13][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); -} - -/* Quad SIMD Nodes */ - -void QBVH::pack_nodes(const BVHNode *root) -{ - /* Calculate size of the arrays required. */ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_QNODE_SIZE; - } - /* Resize arrays. */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_qbvh_is_unaligned(root) - ? 
BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* Inner node. */ - const BVHNode *node = e.node; - const BVHNode *node0 = node->get_child(0); - const BVHNode *node1 = node->get_child(1); - /* Collect nodes. */ - const BVHNode *nodes[4]; - int numnodes = 0; - if(node0->is_leaf()) { - nodes[numnodes++] = node0; - } - else { - nodes[numnodes++] = node0->get_child(0); - nodes[numnodes++] = node0->get_child(1); - } - if(node1->is_leaf()) { - nodes[numnodes++] = node1; - } - else { - nodes[numnodes++] = node1->get_child(0); - nodes[numnodes++] = node1->get_child(1); - } - /* Push entries on the stack. */ - for(int i = 0; i < numnodes; ++i) { - int idx; - if(nodes[i]->is_leaf()) { - idx = nextLeafNodeIdx++; - } - else { - idx = nextNodeIdx; - nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - stack.push_back(BVHStackEntry(nodes[i], idx)); - } - /* Set node. */ - pack_inner(e, &stack[stack.size()-numnodes], numnodes); - } - } - assert(node_size == nextNodeIdx); - /* Root index to start traversal at, to handle case of single leaf node. */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void QBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); -} - -void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - int4 *data = &pack.leaf_nodes[idx]; - int4 c = data[0]; - /* Refit leaf node. */ - for(int prim = c.x; prim < c.y; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* Object instance. */ - bbox.grow(ob->bounds); - } - else { - /* Primitives. 
*/ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* Curves. */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* Motion curves. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* Triangles. */ - int tri_offset = (params.top_level)? mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* Motion triangles. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): This is actually a copy of pack_leaf(), - * but this chunk of code only knows actual data and has - * no idea about BVHNode. - * - * Would be nice to de-duplicate code, but trying to make - * making code more general ends up in much nastier code - * in my opinion so far. - * - * Same applies to the inner nodes case below. 
- */ - float4 leaf_data[BVH_QNODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c.x); - leaf_data[0].y = __int_as_float(c.y); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); - } - else { - int4 *data = &pack.nodes[idx]; - bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - int4 c; - if(is_unaligned) { - c = data[13]; - } - else { - c = data[7]; - } - /* Refit inner node, set bbox from children. */ - BoundBox child_bbox[4] = {BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty}; - uint child_visibility[4] = {0}; - int num_nodes = 0; - - for(int i = 0; i < 4; ++i) { - if(c[i] != 0) { - refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), - child_bbox[i], child_visibility[i]); - ++num_nodes; - bbox.grow(child_bbox[i]); - visibility |= child_visibility[i]; - } - } - - if(is_unaligned) { - Transform aligned_space[4] = {transform_identity(), - transform_identity(), - transform_identity(), - transform_identity()}; - pack_unaligned_node(idx, - aligned_space, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - else { - pack_aligned_node(idx, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - } -} - CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 60bc62ee6e4..7bac6112fd9 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -33,15 +33,8 @@ class LeafNode; class Object; class Progress; -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 8 -#define BVH_QNODE_LEAF_SIZE 1 -#define BVH_ALIGN 4096 -#define TRI_NODE_SIZE 3 - -#define BVH_UNALIGNED_NODE_SIZE 7 -#define BVH_UNALIGNED_QNODE_SIZE 14 +#define BVH_ALIGN 4096 +#define TRI_NODE_SIZE 3 /* Packed BVH * @@ -54,7 +47,7 @@ struct PackedBVH { /* BVH leaf nodes storage. 
*/ array<int4> leaf_nodes; /* object index to BVH node index mapping for instances */ - array<int> object_node; + array<int> object_node; /* Mapping from primitive index to index in triangle array. */ array<uint> prim_tri_index; /* Continuous storage of triangle vertices. */ @@ -110,95 +103,16 @@ protected: virtual void refit_nodes() = 0; }; -/* Binary BVH - * - * Typical BVH with each node having two children. */ - -class BinaryBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - BinaryBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); -}; - -/* QBVH - * - * Quad BVH, with each node having four children, to use with SIMD instructions. 
*/ +/* Pack Utility */ +struct BVHStackEntry +{ + const BVHNode *node; + int idx; -class QBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - QBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); + BVHStackEntry(const BVHNode *n = 0, int i = 0); + int encodeIdx() const; }; CCL_NAMESPACE_END #endif /* __BVH_H__ */ - diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp new file mode 100644 index 00000000000..340ba7dcf53 --- /dev/null +++ b/intern/cycles/bvh/bvh2.cpp @@ -0,0 +1,364 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh2.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +static bool node_bvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + return node0->is_unaligned || node1->is_unaligned; +} + +BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ +} + +void BVH2::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) +{ + assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + float4 data[BVH_NODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); +} + +void BVH2::pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + if(e0.node->is_unaligned || e1.node->is_unaligned) { + pack_unaligned_inner(e, e0, e1); + } else { + pack_aligned_inner(e, e0, e1); + } +} + +void BVH2::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_aligned_node(e.idx, + e0.node->bounds, e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 
0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + int4 data[BVH_NODE_SIZE] = { + make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, + visibility1 & ~PATH_RAY_NODE_UNALIGNED, + c0, c1), + make_int4(__float_as_int(b0.min.x), + __float_as_int(b1.min.x), + __float_as_int(b0.max.x), + __float_as_int(b1.max.x)), + make_int4(__float_as_int(b0.min.y), + __float_as_int(b1.min.y), + __float_as_int(b0.max.y), + __float_as_int(b1.max.y)), + make_int4(__float_as_int(b0.min.z), + __float_as_int(b1.min.z), + __float_as_int(b0.max.z), + __float_as_int(b1.max.z)), + }; + + memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); +} + +void BVH2::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_unaligned_node(e.idx, + e0.node->get_aligned_space(), + e1.node->get_aligned_space(), + e0.node->bounds, + e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + float4 data[BVH_UNALIGNED_NODE_SIZE]; + Transform space0 = BVHUnaligned::compute_node_transform(bounds0, + aligned_space0); + Transform space1 = BVHUnaligned::compute_node_transform(bounds1, + aligned_space1); + data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(c0), + __int_as_float(c1)); + + data[1] = space0.x; + data[2] = space0.y; + data[3] = space0.z; + data[4] = space1.x; + data[5] = space1.y; + data[6] = space1.z; + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); +} + +void BVH2::pack_nodes(const 
BVHNode *root) +{ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_NODE_SIZE; + } + /* Resize arrays */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_bvh_is_unaligned(root) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* innner node */ + int idx[2]; + for(int i = 0; i < 2; ++i) { + if(e.node->get_child(i)->is_leaf()) { + idx[i] = nextLeafNodeIdx++; + } + else { + idx[i] = nextNodeIdx; + nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + ? 
BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + } + + stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); + stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); + + pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); + } + } + assert(node_size == nextNodeIdx); + /* root index to start traversal at, to handle case of single leaf node */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH2::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + const int4 *data = &pack.leaf_nodes[idx]; + const int c0 = data[0].x; + const int c1 = data[0].y; + /* refit leaf node */ + for(int prim = c0; prim < c1; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* object instance */ + bbox.grow(ob->bounds); + } + else { + /* primitives */ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* curves */ + int str_offset = (params.top_level)? 
mesh->curve_offset: 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* motion curves */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + else { + /* triangles */ + int tri_offset = (params.top_level)? mesh->tri_offset: 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* motion triangles */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + + visibility |= ob->visibility; + } + + /* TODO(sergey): De-duplicate with pack_leaf(). 
*/ + float4 leaf_data[BVH_NODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c0); + leaf_data[0].y = __int_as_float(c1); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(data[0].w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); + } + else { + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + + const int4 *data = &pack.nodes[idx]; + const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + const int c0 = data[0].z; + const int c1 = data[0].w; + /* refit inner node, set bbox from children */ + BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; + uint visibility0 = 0, visibility1 = 0; + + refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); + refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); + + if(is_unaligned) { + Transform aligned_space = transform_identity(); + pack_unaligned_node(idx, + aligned_space, aligned_space, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + else { + pack_aligned_node(idx, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + + bbox.grow(bbox0); + bbox.grow(bbox1); + visibility = visibility0|visibility1; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h new file mode 100644 index 00000000000..df65ddca5b7 --- /dev/null +++ b/intern/cycles/bvh/bvh2.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH2_H__ +#define __BVH2_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_NODE_SIZE 4 +#define BVH_NODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_NODE_SIZE 7 + +/* BVH2 + * + * Typical BVH with each node having two children. + */ +class BVH2 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH2(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH2_H__ */ diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp new file mode 100644 index 00000000000..5034ab811d5 --- /dev/null +++ b/intern/cycles/bvh/bvh4.cpp @@ -0,0 +1,516 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh4.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +/* Can we avoid this somehow or make more generic? + * + * Perhaps we can merge nodes in actual tree and make our + * life easier all over the place. + */ +static bool node_qbvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + bool has_unaligned = false; + if(node0->is_leaf()) { + has_unaligned |= node0->is_unaligned; + } + else { + has_unaligned |= node0->get_child(0)->is_unaligned; + has_unaligned |= node0->get_child(1)->is_unaligned; + } + if(node1->is_leaf()) { + has_unaligned |= node1->is_unaligned; + } + else { + has_unaligned |= node1->get_child(0)->is_unaligned; + has_unaligned |= node1->get_child(1)->is_unaligned; + } + return has_unaligned; +} + +BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ + params.use_qbvh = true; +} + +void BVH4::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +{ + float4 data[BVH_QNODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = 
__int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); +} + +void BVH4::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. + */ + pack_aligned_inner(e, en, num); + } +} + +void BVH4::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_aligned_node(e.idx, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + float3 bb_min = bounds[i].min; + float3 bb_max = bounds[i].max; + + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; + + data[7][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be 
recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); +} + +void BVH4::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + Transform aligned_space[4]; + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + aligned_space[i] = en[i].node->get_aligned_space(); + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_unaligned_node(e.idx, + aligned_space, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_UNALIGNED_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + bounds[i], + aligned_space[i]); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; + + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; + + data[13][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. 
+ */ + + data[1][i] = 1.0f; + data[2][i] = 0.0f; + data[3][i] = 0.0f; + + data[4][i] = 0.0f; + data[5][i] = 0.0f; + data[6][i] = 0.0f; + + data[7][i] = 0.0f; + data[8][i] = 0.0f; + data[9][i] = 0.0f; + + data[10][i] = -FLT_MAX; + data[11][i] = -FLT_MAX; + data[12][i] = -FLT_MAX; + + data[13][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); +} + +/* Quad SIMD Nodes */ + +void BVH4::pack_nodes(const BVHNode *root) +{ + /* Calculate size of the arrays required. */ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_QNODE_SIZE; + } + /* Resize arrays. */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_qbvh_is_unaligned(root) + ? 
BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* Inner node. */ + const BVHNode *node = e.node; + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + /* Collect nodes. */ + const BVHNode *nodes[4]; + int numnodes = 0; + if(node0->is_leaf()) { + nodes[numnodes++] = node0; + } + else { + nodes[numnodes++] = node0->get_child(0); + nodes[numnodes++] = node0->get_child(1); + } + if(node1->is_leaf()) { + nodes[numnodes++] = node1; + } + else { + nodes[numnodes++] = node1->get_child(0); + nodes[numnodes++] = node1->get_child(1); + } + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { + int idx; + if(nodes[i]->is_leaf()) { + idx = nextLeafNodeIdx++; + } + else { + idx = nextNodeIdx; + nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + stack.push_back(BVHStackEntry(nodes[i], idx)); + } + /* Set node. */ + pack_inner(e, &stack[stack.size()-numnodes], numnodes); + } + } + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH4::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + int4 *data = &pack.leaf_nodes[idx]; + int4 c = data[0]; + /* Refit leaf node. */ + for(int prim = c.x; prim < c.y; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* Object instance. */ + bbox.grow(ob->bounds); + } + else { + /* Primitives. 
*/ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* Curves. */ + int str_offset = (params.top_level)? mesh->curve_offset: 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* Motion curves. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + else { + /* Triangles. */ + int tri_offset = (params.top_level)? mesh->tri_offset: 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* Motion triangles. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + + visibility |= ob->visibility; + } + + /* TODO(sergey): This is actually a copy of pack_leaf(), + * but this chunk of code only knows actual data and has + * no idea about BVHNode. + * + * Would be nice to de-duplicate code, but trying to make + * making code more general ends up in much nastier code + * in my opinion so far. + * + * Same applies to the inner nodes case below. 
+ */ + float4 leaf_data[BVH_QNODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c.x); + leaf_data[0].y = __int_as_float(c.y); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(c.w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); + } + else { + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[13]; + } + else { + c = data[7]; + } + /* Refit inner node, set bbox from children. */ + BoundBox child_bbox[4] = {BoundBox::empty, + BoundBox::empty, + BoundBox::empty, + BoundBox::empty}; + uint child_visibility[4] = {0}; + int num_nodes = 0; + + for(int i = 0; i < 4; ++i) { + if(c[i] != 0) { + refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), + child_bbox[i], child_visibility[i]); + ++num_nodes; + bbox.grow(child_bbox[i]); + visibility |= child_visibility[i]; + } + } + + if(is_unaligned) { + Transform aligned_space[4] = {transform_identity(), + transform_identity(), + transform_identity(), + transform_identity()}; + pack_unaligned_node(idx, + aligned_space, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + else { + pack_aligned_node(idx, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h new file mode 100644 index 00000000000..310909a37e1 --- /dev/null +++ b/intern/cycles/bvh/bvh4.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH4_H__ +#define __BVH4_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_QNODE_SIZE 8 +#define BVH_QNODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_QNODE_SIZE 14 + +/* BVH4 + * + * Quad BVH, with each node having four children, to use with SIMD instructions. + */ +class BVH4 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH4(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH4_H__ 
*/ diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index 3226008f511..63a7fc11668 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -17,10 +17,10 @@ //#define __KERNEL_SSE__ -#include <stdlib.h> - #include "bvh/bvh_binning.h" +#include <stdlib.h> + #include "util/util_algorithm.h" #include "util/util_boundbox.h" #include "util/util_types.h" diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 285f9c56a62..c2e259b1696 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -111,5 +111,4 @@ protected: CCL_NAMESPACE_END -#endif - +#endif /* __BVH_BINNING_H__ */ diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 95c71b54da0..1880964355c 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -15,8 +15,9 @@ * limitations under the License. */ -#include "bvh/bvh_binning.h" #include "bvh/bvh_build.h" + +#include "bvh/bvh_binning.h" #include "bvh/bvh_node.h" #include "bvh/bvh_params.h" #include "bvh_split.h" diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index 5733708050d..7b245139819 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -20,17 +20,17 @@ #include <float.h> -#include "bvh/bvh.h" -#include "bvh/bvh_binning.h" +#include "bvh/bvh_params.h" #include "bvh/bvh_unaligned.h" -#include "util/util_boundbox.h" #include "util/util_task.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class Boundbox; class BVHBuildTask; +class BVHNode; class BVHSpatialSplitBuildTask; class BVHParams; class InnerNode; diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 4f788c66797..4237c62ab5b 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -15,9 +15,10 @@ * limitations under the License. 
*/ +#include "bvh/bvh_node.h" + #include "bvh/bvh.h" #include "bvh/bvh_build.h" -#include "bvh/bvh_node.h" #include "util/util_debug.h" #include "util/util_vector.h" diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 60511b4b012..1c875f5a524 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -19,7 +19,6 @@ #define __BVH_NODE_H__ #include "util/util_boundbox.h" -#include "util/util_debug.h" #include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 9795a7a4350..7dd699b33a4 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -246,4 +246,3 @@ struct BVHSpatialStorage { CCL_NAMESPACE_END #endif /* __BVH_PARAMS_H__ */ - diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index d29629c0279..3a01061b285 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -15,9 +15,10 @@ * limitations under the License. */ -#include "bvh/bvh_build.h" #include "bvh/bvh_sort.h" +#include "bvh/bvh_build.h" + #include "util/util_algorithm.h" #include "util/util_debug.h" #include "util/util_task.h" diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index b49ca02eb60..936401d8607 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -18,8 +18,11 @@ #ifndef __BVH_SORT_H__ #define __BVH_SORT_H__ +#include <cstddef> + CCL_NAMESPACE_BEGIN +class BVHReference; class BVHUnaligned; struct Transform; @@ -33,4 +36,3 @@ void bvh_reference_sort(int start, CCL_NAMESPACE_END #endif /* __BVH_SORT_H__ */ - diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index b10d69a495d..c55ba40b565 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -15,8 +15,9 @@ * limitations under the License. 
*/ -#include "bvh/bvh_build.h" #include "bvh/bvh_split.h" + +#include "bvh/bvh_build.h" #include "bvh/bvh_sort.h" #include "render/mesh.h" diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp index ef227d20ea9..b522a8f3e10 100644 --- a/intern/cycles/bvh/bvh_unaligned.cpp +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "bvh/bvh_unaligned.h" #include "render/mesh.h" diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h index f41bae79e2b..c3ece051cd5 100644 --- a/intern/cycles/bvh/bvh_unaligned.h +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -78,4 +78,3 @@ protected: CCL_NAMESPACE_END #endif /* __BVH_UNALIGNED_H__ */ - diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 403a0540963..df88b91f5ac 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -135,13 +135,5 @@ if(CYCLES_STANDALONE_REPOSITORY) unset(_lib_DIR) else() - if(WIN32) - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src/windows) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - else() - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - endif() - set(GFLAGS_NAMESPACE "gflags") set(LLVM_LIBRARIES ${LLVM_LIBRARY}) endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6ef2aa1caad..74ec57ddf74 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRC device.cpp device_cpu.cpp device_cuda.cpp + device_denoising.cpp device_multi.cpp device_opencl.cpp device_split_kernel.cpp @@ -48,6 +49,7 @@ endif() set(SRC_HEADERS device.h + device_denoising.h device_memory.h device_intern.h device_network.h diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 
968af447e29..a54bb77f9f3 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -66,6 +66,10 @@ std::ostream& operator <<(std::ostream &os, << string_from_bool(requested_features.use_patch_evaluation) << std::endl; os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) << std::endl; + os << "Use Principled BSDF: " + << string_from_bool(requested_features.use_principled) << std::endl; + os << "Use Denoising: " + << string_from_bool(requested_features.use_denoising) << std::endl; return os; } @@ -400,4 +404,16 @@ void Device::free_memory() devices.free_memory(); } + +device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type) + : device(device) +{ + ptr = device->mem_alloc_sub_ptr(mem, offset, size, type); +} + +device_sub_ptr::~device_sub_ptr() +{ + device->mem_free_sub_ptr(ptr); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ac06e561795..b3b693c630c 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -124,6 +124,12 @@ public: /* Use various shadow tricks, such as shadow catcher. */ bool use_shadow_tricks; + /* Per-uber shader usage flags. */ + bool use_principled; + + /* Denoising features. */ + bool use_denoising; + DeviceRequestedFeatures() { /* TODO(sergey): Find more meaningful defaults. 
*/ @@ -141,6 +147,8 @@ public: use_patch_evaluation = false; use_transparent = false; use_shadow_tricks = false; + use_principled = false; + use_denoising = false; } bool modified(const DeviceRequestedFeatures& requested_features) @@ -158,7 +166,9 @@ public: use_integrator_branched == requested_features.use_integrator_branched && use_patch_evaluation == requested_features.use_patch_evaluation && use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks); + use_shadow_tricks == requested_features.use_shadow_tricks && + use_principled == requested_features.use_principled && + use_denoising == requested_features.use_denoising); } /* Convert the requested features structure to a build options, @@ -205,6 +215,12 @@ public: if(!use_shadow_tricks) { build_options += " -D__NO_SHADOW_TRICKS__"; } + if(!use_principled) { + build_options += " -D__NO_PRINCIPLED__"; + } + if(!use_denoising) { + build_options += " -D__NO_DENOISING__"; + } return build_options; } }; @@ -220,6 +236,7 @@ struct DeviceDrawParams { }; class Device { + friend class device_sub_ptr; protected: Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {} @@ -229,6 +246,14 @@ protected: /* used for real time display */ unsigned int vertex_buffer; + virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/) + { + /* Only required for devices that implement denoising. 
*/ + assert(false); + return (device_ptr) 0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; + public: virtual ~Device(); @@ -257,6 +282,8 @@ public: virtual void mem_zero(device_memory& mem) = 0; virtual void mem_free(device_memory& mem) = 0; + virtual int mem_address_alignment() { return 16; } + /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; @@ -304,6 +331,8 @@ public: /* multi device */ virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} virtual int device_number(Device * /*sub_device*/) { return 0; } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} /* static */ static Device *create(DeviceInfo& info, Stats &stats, bool background = true); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 3c481bb2b39..18112437b45 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -25,6 +25,7 @@ #endif #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" @@ -34,6 +35,8 @@ #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/filter/filter.h" + #include "kernel/osl/osl_shader.h" #include "kernel/osl/osl_globals.h" @@ -53,91 +56,108 @@ CCL_NAMESPACE_BEGIN class CPUDevice; -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); +/* Has to be outside of the class 
to be shared across template instantiations. */ +static const char *logged_architecture = ""; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); -}; - -class CPUDevice : public Device -{ - static unordered_map<string, void*> kernel_functions; - - static void register_kernel_function(const char* name, void* func) +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { - kernel_functions[name] = func; + kernel = (F)NULL; } - static const char* get_arch_name() + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. 
*/ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { - return "cpu_avx2"; + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - return "cpu_avx"; + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - return "cpu_sse41"; + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - return "cpu_sse3"; + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - return "cpu_sse2"; + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - return "cpu"; + + if(strstr(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } - template<typename F> - static F get_kernel_function(string name) - { - name = string("kernel_") + get_arch_name() + "_" + name; - - unordered_map<string, void*>::iterator it = kernel_functions.find(name); + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; - if(it == kernel_functions.end()) { - assert(!"kernel function not found"); - return NULL; - } +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); - return (F)it->second; - } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& 
work_pool_wgs); - friend class CPUSplitKernel; + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; +class CPUDevice : public Device +{ public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -149,77 +169,92 @@ public: bool use_split_kernel; DeviceRequestedFeatures requested_features; - + + KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + 
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) + : Device(info, stats, background), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_detect_outliers), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) 
+#undef REGISTER_KERNEL { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif - - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; - } - else -#endif - { - VLOG(1) << "Will be using regular kernels."; - } - use_split_kernel = DebugFlags().cpu.split_kernel; if(use_split_kernel) { VLOG(1) << "Will be using split kernel."; } - kernel_cpu_register_functions(register_kernel_function); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - kernel_cpu_sse2_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - kernel_cpu_sse3_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - kernel_cpu_sse41_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - kernel_cpu_avx_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - kernel_cpu_avx2_register_functions(register_kernel_function); -#endif +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + 
REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(enqueue_inactive); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS } ~CPUDevice() @@ -273,13 +308,17 @@ public: if(!mem.data_pointer) { free((void*)mem.device_pointer); } - mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -326,13 +365,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - if(!use_split_kernel) { - thread_path_trace(*task); - } - else { - thread_path_trace_split(*task); - } + if(task->type == DeviceTask::RENDER) { + thread_render(*task); } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); @@ -349,117 +383,335 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; 
i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + return true; + } - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); + } + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - 
path_trace_kernel = kernel_cpu_avx_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) color_ptr, + (float*) color_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, 
local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4, + task->buffer.pass_stride); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } } + return true; + } - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask * /*task*/) + { + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); + } + } + return true; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + 
tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } - } + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - tile.sample = sample + 1; + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - task.update_progress(&tile, tile.w*tile.h); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_detect_outliers_kernel()(x, y, + (float*) image_ptr, + (float*) variance_ptr, + (float*) depth_ptr, + (float*) output_ptr, + &task->rect.x, + task->buffer.pass_stride); } + } + return true; + } - task.release_tile(tile); + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = 
(float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - if(task_pool.canceled()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); } + } + + void denoise(DeviceTask &task, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); - thread_kernel_globals_free(&kg); + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; + + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, 
this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); } - void thread_path_trace_split(DeviceTask& task) + void thread_render(DeviceTask& task) { if(task_pool.canceled()) { if(task.need_finish_queue == false) return; } - RenderTile tile; - - CPUSplitKernel split_kernel(this); - /* allocate buffer for kernel globals */ - device_memory kgbuffer; - kgbuffer.resize(sizeof(KernelGlobals)); + device_only_memory<KernelGlobals> kgbuffer; + kgbuffer.resize(1); mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); - KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; - *kg = thread_kernel_globals_init(); + KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - requested_features.max_closure = MAX_CLOSURE; - if(!split_kernel.load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - mem_free(kgbuffer); + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); - return; + delete split_kernel; + return; + } } + RenderTile tile; while(task.acquire_tile(this, tile)) { - device_memory data; - split_kernel.path_trace(&task, tile, kgbuffer, data); + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_memory data; + split_kernel->path_trace(&task, tile, kgbuffer, data); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, tile); + } task.release_tile(tile); @@ -470,7 +722,9 @@ public: } thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + kg->~KernelGlobals(); 
mem_free(kgbuffer); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -478,86 +732,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef 
WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -569,53 +753,17 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - 
(uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -752,58 +900,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, device_memory& use_queues_flags, device_memory& work_pool_wgs) { - typedef void(*data_init_t)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - ccl_global uint *rng_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - - data_init_t data_init; - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - data_init = kernel_cpu_avx2_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - data_init = kernel_cpu_avx_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - data_init = kernel_cpu_sse41_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - data_init = kernel_cpu_sse3_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - data_init = kernel_cpu_sse2_data_init; - } - else -#endif - { - data_init = kernel_cpu_data_init; - } - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); @@ -811,37 +907,38 @@ bool 
CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, for(int x = 0; x < dim.global_size[0]; x++) { kg->global_id = make_int2(x, y); - data_init((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - (uint*)rtile.rng_state, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); + device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); } } return true; } -SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) { CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + kernel->func = device->split_kernels[kernel_name](); if(!kernel->func) { delete kernel; return NULL; @@ -865,8 +962,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device return split_data_buffer_size(kg, num_threads); } -unordered_map<string, 
void*> CPUDevice::kernel_functions; - Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index ef283c9d455..3a29538aa13 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,11 +21,14 @@ #include <string.h> #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" #include "render/buffers.h" +#include "kernel/filter/filter_defines.h" + #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else @@ -102,7 +105,8 @@ public: device_memory& use_queues_flag, device_memory& work_pool_wgs); - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); virtual int2 split_kernel_local_size(); virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); }; @@ -113,12 +117,13 @@ public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; + CUmodule cuModule, cuFilterModule; map<device_ptr, bool> tex_interp_map; map<device_ptr, uint> tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; + CUDASplitKernel *split_kernel; struct PixelMem { GLuint cuPBO; @@ -169,7 +174,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -221,6 +226,11 @@ public: cuDevice = 0; cuContext = 0; + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + 
need_bindless_mapping = false; /* intialize */ @@ -260,6 +270,8 @@ public: { task_pool.stop(); + delete split_kernel; + if(info.has_bindless_textures) { tex_free(bindless_mapping); } @@ -296,7 +308,8 @@ public: * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, bool split=false) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -311,7 +324,7 @@ public: machine, cuda_version, include_path.c_str()); - if(use_adaptive_compilation()) { + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -359,8 +372,22 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -368,9 +395,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf(split ? 
"lib/kernel_split_sm_%d%d.cubin" - : "lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -379,7 +405,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features, split); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ const string source_path = path_get("source"); @@ -390,9 +416,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" - : "cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -427,7 +452,7 @@ public: const string kernel = path_join( path_join(source_path, "kernel"), path_join("kernels", - path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -466,6 +491,16 @@ public: bool load_kernels(const DeviceRequestedFeatures& requested_features) { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if(cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + /* check if cuda init succeeded */ if(cuContext == 0) return false; @@ -475,11 +510,14 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features, use_split_kernel()); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ cuda_push_context(); @@ -494,6 +532,14 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + cuda_pop_context(); return (result == CUDA_SUCCESS); @@ -576,6 +622,11 @@ public: } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -876,6 +927,393 @@ public: } } + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } + + mem_copy_to(task->tiles_mem); + + return !have_error(); + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = 
(int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + 
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + + int dx, dy; + int4 local_rect; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + dy = i / (2*r+1) - r; + dx = i % (2*r+1) - r; + local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); + } + + local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, 
+ &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cuda_push_context(); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), 
max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &color_ptr, + &color_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); + } + + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, 
"kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + 
task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, const DeviceTask &task) + { + DenoisingTask denoising(this); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = 
function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + void path_trace(RenderTile& rtile, int sample, bool branched) { if(have_error()) @@ -1300,7 +1738,7 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { + if(task->type == DeviceTask::RENDER) { RenderTile tile; bool branched = task->integrator_branched; @@ -1308,47 +1746,56 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - if(!use_split_kernel()) { - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; + } + + if(split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) 
{ + device_memory void_buffer; + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - path_trace(tile, sample, branched); + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - tile.sample = sample + 1; + path_trace(tile, sample, branched); - task->update_progress(&tile, tile.w*tile.h); - } + tile.sample = sample + 1; - task->release_tile(tile); - } - } - else { - DeviceRequestedFeatures requested_features; - if(!use_adaptive_compilation()) { - requested_features.max_closure = 64; + task->update_progress(&tile, tile.w*tile.h); + } + } } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; - CUDASplitKernel split_kernel(this); - split_kernel.load_kernels(requested_features); + denoise(tile, *task); - while(task->acquire_tile(this, tile)) { - device_memory void_buffer; - split_kernel.path_trace(task, tile, void_buffer, void_buffer); + task->update_progress(&tile, tile.w*tile.h); + } - task->release_tile(tile); + task->release_tile(tile); - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; } } } @@ -1591,7 +2038,8 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim return !device->have_error(); } -SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) { CUfunction func; @@ -1627,7 +2075,8 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& << string_human_readable_size(free) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - int2 
global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements)); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp new file mode 100644 index 00000000000..619cc1d171e --- /dev/null +++ b/intern/cycles/device/device_denoising.cpp @@ -0,0 +1,232 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoising.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +void DenoisingTask::init_from_devicetask(const DeviceTask &task) +{ + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.denoising_data_offset = task.pass_denoising_data; + render_buffer.denoising_clean_offset = task.pass_denoising_clean; + + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = make_int4(max(tiles->x[0], filter_area.x - radius), + max(tiles->y[0], filter_area.y - radius), + min(tiles->x[3], filter_area.x + filter_area.z + radius), + min(tiles->y[3], filter_area.y + filter_area.w + radius)); +} + +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +{ + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + + device_ptr buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; + } + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; + + render_buffer.offset = rtiles[4].offset; + render_buffer.stride = rtiles[4].stride; + render_buffer.ptr = rtiles[4].buffer; + + functions.set_tiles(buffers); +} + +bool DenoisingTask::run_denoising() +{ + /* Allocate denoising buffer. 
*/ + buffer.passes = 14; + buffer.w = align_up(rect.z - rect.x, 4); + buffer.h = rect.w - rect.y; + buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.mem.resize(buffer.pass_stride * buffer.passes); + device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE); + + device_ptr null_ptr = (device_ptr) 0; + + /* Prefilter shadow feature. */ + { + device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. 
*/ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + } + + /* Prefilter general features. 
*/ + { + device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; + int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } + } + + /* Copy color passes. 
*/ + { + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + device_only_memory<float> temp_color; + temp_color.resize(3*buffer.pass_stride); + device->mem_alloc("Denoising temporary color", temp_color, MEM_READ_WRITE); + + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(device, temp_color, pass*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + } + + { + device_sub_ptr depth_pass (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr output_pass (device, buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + } + + device->mem_free(temp_color); + } + + storage.w = filter_area.z; + storage.h = filter_area.w; + storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE); + storage.rank.resize(storage.w*storage.h); + device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE); + device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE); + + functions.construct_transform(); + + device_only_memory<float> temporary_1; + device_only_memory<float> temporary_2; + temporary_1.resize(buffer.w*buffer.h); + temporary_2.resize(buffer.w*buffer.h); + device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE); + device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = 
temporary_2.device_pointer; + + storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE); + storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE); + device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE); + device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE); + + reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, + render_buffer.stride, + render_buffer.pass_stride, + render_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z-rect.x; + reconstruction_state.source_h = rect.w-rect.y; + + { + device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr); + } + + device->mem_free(storage.XtWX); + device->mem_free(storage.XtWY); + device->mem_free(storage.transform); + device->mem_free(storage.rank); + device->mem_free(temporary_1); + device->mem_free(temporary_2); + device->mem_free(buffer.mem); + device->mem_free(tiles_mem); + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h new file mode 100644 index 00000000000..def7b72f67d --- /dev/null +++ b/intern/cycles/device/device_denoising.h @@ -0,0 +1,148 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_DENOISING_H__ +#define __DEVICE_DENOISING_H__ + +#include "device/device.h" + +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +class DenoisingTask { +public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Pointer and parameters of the RenderBuffers. */ + struct RenderBuffers { + int denoising_data_offset; + int denoising_clean_offset; + int pass_stride; + int offset; + int stride; + device_ptr ptr; + int samples; + } render_buffer; + + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_rendertiles(RenderTile *rtiles); + + int4 rect; + int4 filter_area; + + struct DeviceFunctions { + function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. 
*/ + )> non_local_means; + function<bool(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr + )> reconstruct; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect + )> combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr + )> divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr + )> get_feature; + function<bool(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr + )> detect_outliers; + function<bool(device_ptr*)> set_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. */ + struct ReconstructionState { + device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ + device_ptr temporary_2_ptr; + + int4 filter_rect; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */ + device_ptr temporary_2_ptr; + device_ptr temporary_3_ptr; + + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. 
*/ + + void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + } storage; + + DenoisingTask(Device *device) : device(device) {} + + void init_from_devicetask(const DeviceTask &task); + + bool run_denoising(); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int w; + int h; + device_only_memory<float> mem; + } buffer; + +protected: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 4b10514a9d2..b63dd00068b 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -35,6 +35,8 @@ CCL_NAMESPACE_BEGIN +class Device; + enum MemoryType { MEM_READ_ONLY, MEM_WRITE_ONLY, @@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> { template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 3; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { @@ -173,6 +175,9 @@ class device_memory { public: size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } + size_t memory_elements_size(int elements) { + return elements*data_elements*datatype_size(data_type); + } /* data information */ DataType data_type; @@ -213,6 +218,22 @@ protected: device_memory& operator = (const device_memory&); }; +template<typename T> +class device_only_memory : public device_memory +{ +public: + device_only_memory() + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + void resize(size_t num) + { + device_memory::resize(num*sizeof(T)); + } +}; + /* Device Vector */ template<typename T> class 
device_vector : public device_memory @@ -299,6 +320,27 @@ private: array<T> data; }; +/* A device_sub_ptr is a pointer into another existing memory. + * Therefore, it is not allocated separately, but just created from the already allocated base memory. + * It is freed automatically when it goes out of scope, which should happen before the base memory is freed. + * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */ +class device_sub_ptr +{ +public: + device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type); + ~device_sub_ptr(); + /* No copying. */ + device_sub_ptr& operator = (const device_sub_ptr&); + + device_ptr operator*() const + { + return ptr; + } +protected: + Device *device; + device_ptr ptr; +}; + CCL_NAMESPACE_END #endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 624260a81c8..bc505b676fc 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -299,6 +299,60 @@ public: return -1; } + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. 
*/ + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + tiles[i].buffers->copy_from_device(); + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = 0; + sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE); + sub_device->mem_copy_to(mem); + tiles[i].buffer = mem.device_pointer; + mem.device_pointer = original_ptr; + } + } + } + + void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = tiles[i].buffer; + + /* Copy denoised tile to the host. */ + if(i == 4) { + tiles[i].buffers->copy_from_device(sub_device); + } + + size_t mem_size = mem.device_size; + sub_device->mem_free(mem); + mem.device_pointer = original_ptr; + mem.device_size = mem_size; + + /* Copy denoised tile to the original device. 
*/ + if(i == 4) { + tiles[i].buffers->device->mem_copy_to(mem); + } + } + } + } + int get_split_task_count(DeviceTask& task) { int total_tasks = 0; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index edd2047debc..681b8214b03 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -130,10 +130,22 @@ string device_opencl_capabilities(void) opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ result += string_printf("%s: %s\n", name, data); \ } while(false) +#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ + do { \ + char data[1024] = "\0"; \ + size_t length = 0; \ + if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ + if(length != 0 && data[0] != '\0') { \ + result += string_printf("%s: %s\n", name, data); \ + } \ + } \ + } while(false) #define APPEND_PLATFORM_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what) #define APPEND_DEVICE_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) +#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ + APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) vector<cl_device_id> device_ids; for(cl_uint platform = 0; platform < num_platforms; ++platform) { @@ -167,6 +179,7 @@ string device_opencl_capabilities(void) result += string_printf("\t\tDevice: #%u\n", device); APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); + APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 981ec74fe56..d2b3a89fa98 100644 --- 
a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -19,6 +19,7 @@ #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data_types.h" +#include "util/util_logging.h" #include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -38,12 +39,15 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) kernel_do_volume = NULL; kernel_queue_enqueue = NULL; kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; kernel_shader_eval = NULL; kernel_holdout_emission_blurring_pathtermination_ao = NULL; kernel_subsurface_scatter = NULL; kernel_direct_lighting = NULL; kernel_shadow_blocked_ao = NULL; kernel_shadow_blocked_dl = NULL; + kernel_enqueue_inactive = NULL; kernel_next_iteration_setup = NULL; kernel_indirect_subsurface = NULL; kernel_buffer_update = NULL; @@ -63,12 +67,15 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_do_volume; delete kernel_queue_enqueue; delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; delete kernel_shader_eval; delete kernel_holdout_emission_blurring_pathtermination_ao; delete kernel_subsurface_scatter; delete kernel_direct_lighting; delete kernel_shadow_blocked_ao; delete kernel_shadow_blocked_dl; + delete kernel_enqueue_inactive; delete kernel_next_iteration_setup; delete kernel_indirect_subsurface; delete kernel_buffer_update; @@ -88,12 +95,15 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe LOAD_KERNEL(do_volume); LOAD_KERNEL(queue_enqueue); LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); LOAD_KERNEL(shader_eval); LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); LOAD_KERNEL(subsurface_scatter); LOAD_KERNEL(direct_lighting); LOAD_KERNEL(shadow_blocked_ao); LOAD_KERNEL(shadow_blocked_dl); + LOAD_KERNEL(enqueue_inactive); LOAD_KERNEL(next_iteration_setup); LOAD_KERNEL(indirect_subsurface); 
LOAD_KERNEL(buffer_update); @@ -108,6 +118,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size) { uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + VLOG(1) << "Split state element size: " + << string_human_readable_number(size_per_element) << " bytes. (" + << string_human_readable_size(size_per_element) << ")."; return max_buffer_size / size_per_element; } @@ -156,13 +169,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, unsigned int max_work_groups = num_global_elements / work_pool_size + 1; /* Allocate work_pool_wgs memory. */ - work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + work_pool_wgs.resize(max_work_groups); device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); - queue_index.resize(NUM_QUEUES * sizeof(int)); + queue_index.resize(NUM_QUEUES); device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); - use_queues_flag.resize(sizeof(char)); + use_queues_flag.resize(1); device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); ray_state.resize(num_global_elements); @@ -227,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); bool activeRaysAvailable = true; + double cancel_time = DBL_MAX; while(activeRaysAvailable) { /* Do path-iteration in host [Enqueue Path-iteration kernels. 
*/ @@ -236,18 +250,29 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - if(task->get_cancel()) { + if(task->get_cancel() && cancel_time == DBL_MAX) { + /* Wait up to twice as many seconds for current samples to finish + * to avoid artifacts in render result from ending too soon. 
+ */ + cancel_time = time_dt() + 2.0 * time_multiplier; + } + + if(time_dt() > cancel_time) { return true; } } @@ -271,7 +296,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } - if(task->get_cancel()) { + if(time_dt() > cancel_time) { return true; } } diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 55548122c0c..9c42cb58520 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -61,12 +61,15 @@ private: SplitKernelFunction *kernel_do_volume; SplitKernelFunction *kernel_queue_enqueue; SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; SplitKernelFunction *kernel_shader_eval; SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; SplitKernelFunction *kernel_subsurface_scatter; SplitKernelFunction *kernel_direct_lighting; SplitKernelFunction *kernel_shadow_blocked_ao; SplitKernelFunction *kernel_shadow_blocked_dl; + SplitKernelFunction *kernel_enqueue_inactive; SplitKernelFunction *kernel_next_iteration_setup; SplitKernelFunction *kernel_indirect_subsurface; SplitKernelFunction *kernel_buffer_update; @@ -78,16 +81,16 @@ private: */ device_memory split_data; device_vector<uchar> ray_state; - device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */ + device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_memory use_queues_flag; + device_only_memory<char> use_queues_flag; /* Approximate time it takes to complete one sample */ double avg_time_per_sample; /* Work pool with respect to each work group. */ - device_memory work_pool_wgs; + device_only_memory<unsigned int> work_pool_wgs; /* clos_max value for which the kernels have been loaded currently. 
*/ int current_max_closure; @@ -122,7 +125,8 @@ public: device_memory& use_queues_flag, device_memory& work_pool_wgs) = 0; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) = 0; virtual int2 split_kernel_local_size() = 0; virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; }; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index ca303365627..3bc4c310283 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) if(type == SHADER) { num = min(shader_w, num); } - else if(type == PATH_TRACE) { + else if(type == RENDER) { } else { num = min(h, num); @@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) tasks.push_back(task); } } - else if(type == PATH_TRACE) { + else if(type == RENDER) { for(int i = 0; i < num; i++) tasks.push_back(*this); } @@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != PATH_TRACE) && + if((type != RENDER) && (type != SHADER)) return; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index feee89fd6e4..44a1efff1f5 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -53,7 +53,7 @@ public: int passes_size; - explicit DeviceTask(Type type = PATH_TRACE); + explicit DeviceTask(Type type = RENDER); int get_subtask_count(int num, int max_size 
= 0); void split(list<DeviceTask>& tasks, int num, int max_size = 0); @@ -65,6 +65,16 @@ public: function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; + function<void(RenderTile*, Device*)> map_neighbor_tiles; + function<void(RenderTile*, Device*)> unmap_neighbor_tiles; + + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + int pass_stride; + int pass_denoising_data; + int pass_denoising_clean; bool need_finish_queue; bool integrator_branched; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 764216d0dfa..78ca377d933 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -17,6 +17,7 @@ #ifdef WITH_OPENCL #include "device/device.h" +#include "device/device_denoising.h" #include "util/util_map.h" #include "util/util_param.h" @@ -26,24 +27,24 @@ CCL_NAMESPACE_BEGIN +/* Disable workarounds, seems to be working fine on latest drivers. */ +#define CYCLES_DISABLE_DRIVER_WORKAROUNDS + /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. 
*/ # undef clEnqueueNDRangeKernel # define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueWriteBuffer # define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueReadBuffer # define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); #endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ @@ -86,7 +87,7 @@ public: string *error = NULL); static bool device_version_check(cl_device_id device, string *error = NULL); - static string get_hardware_id(string platform_name, + static string get_hardware_id(const string& platform_name, cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); @@ -132,6 +133,13 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error = NULL); + + static int mem_address_alignment(cl_device_id device_id); + /* Get somewhat more readable device name. * Main difference is AMD OpenCL here which only gives code name * for the regular device name. 
This will give more sane device @@ -221,7 +229,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -242,17 +250,17 @@ public: public: OpenCLProgram() : loaded(false), device(NULL) {} OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_name, - string kernel_build_options, + const string& program_name, + const string& kernel_name, + const string& kernel_build_options, bool use_stdout = true); ~OpenCLProgram(); void add_kernel(ustring name); void load(); - bool is_loaded() { return loaded; } - string get_log() { return log; } + bool is_loaded() const { return loaded; } + const string& get_log() const { return log; } void report_error(); cl_kernel operator()(); @@ -266,8 +274,8 @@ public: bool load_binary(const string& clbin, const string *debug_src = NULL); bool save_binary(const string& clbin); - void add_log(string msg, bool is_debug); - void add_error(string msg); + void add_log(const string& msg, bool is_debug); + void add_error(const string& msg); bool loaded; cl_program program; @@ -285,7 +293,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -323,6 +331,9 @@ public: void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_address_alignment(); + void const_copy_to(const char *name, void *host, size_t size); void tex_alloc(const char *name, device_memory& mem, @@ -331,12 +342,14 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int 
group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -370,9 +383,51 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type); + void 
mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: ArgumentWrapper() : size(0), pointer(NULL) diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 52d0662a8e3..509da7a0a84 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -20,6 +20,7 @@ #include "kernel/kernel_types.h" +#include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" @@ -213,8 +214,24 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. 
*/ if(!load_kernels(requested_features, programs)) { return false; @@ -260,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; @@ -322,37 +358,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = 
clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +437,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +525,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 
0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +534,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +623,380 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = 
denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr 
color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + color_mem, + color_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + 
task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. */ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr 
sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 
1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = 
function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ @@ -612,7 +1066,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) string OpenCLDeviceBase::kernel_build_options(const string *debug_src) { - string build_options = "-cl-fast-relaxed-math "; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; if(platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " @@ -792,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is 
not getting feature-based optimizations. @@ -800,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index a2fd1d71156..06c15bcf401 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -108,41 +108,53 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. 
If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index b8df57ec7b9..76d9983e9a2 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -25,6 +25,7 @@ #include "device/device_split_kernel.h" +#include "util/util_algorithm.h" #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_path.h" @@ -70,6 +71,10 @@ public: delete split_kernel; } + virtual bool show_samples() const { + return true; + } + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { @@ -100,7 +105,7 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to @@ -123,21 +128,29 @@ public: /* Keep rendering tiles until done. 
*/ while(task->acquire_tile(this, tile)) { - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. 
+ */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); + task->update_progress(&tile, tile.w*tile.h); + } task->release_tile(tile); } @@ -146,6 +159,11 @@ public: } } + bool is_split_kernel() + { + return true; + } + protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ @@ -159,17 +177,62 @@ protected: friend class OpenCLSplitKernelFunction; }; +struct CachedSplitMemory { + int id; + device_memory *split_data; + device_memory *ray_state; + device_ptr *rng_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; +}; + class OpenCLSplitKernelFunction : public SplitKernelFunction { public: OpenCLDeviceSplitKernel* device; OpenCLDeviceBase::OpenCLProgram program; + CachedSplitMemory& cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) + { + } - OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} - ~OpenCLSplitKernelFunction() { program.release(); } + ~OpenCLSplitKernelFunction() + { + program.release(); + } virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - device->kernel_set_args(program(), 0, kg, data); + if(cached_id != cached_memory.id) { + cl_uint start_arg_index = + device->kernel_set_args(program(), + 0, + kg, + data, + *cached_memory.split_data, + *cached_memory.ray_state, + *cached_memory.rng_state); + +/* TODO(sergey): Avoid map lookup here. 
*/ +#define KERNEL_TEX(type, ttype, name) \ + device->set_kernel_arg_mem(program(), &start_arg_index, #name); +#include "kernel/kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, program(), @@ -196,14 +259,15 @@ public: class OpenCLSplitKernel : public DeviceSplitKernel { OpenCLDeviceSplitKernel *device; + CachedSplitMemory cached_memory; public: explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, const DeviceRequestedFeatures& requested_features) { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); bool single_program = OpenCLInfo::use_single_program(); kernel->program = @@ -332,6 +396,15 @@ public: return false; } + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.rng_state = &rtile.rng_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + return true; } @@ -351,12 +424,18 @@ public: cl_ulong max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + VLOG(1) << "Maximum device allocation size: " << 
string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); - int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index fe1c65a2224..0d34af3e040 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5() } OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_file, - string kernel_build_options, + const string& program_name, + const string& kernel_file, + const string& kernel_build_options, bool use_stdout) : device(device), program_name(program_name), @@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) +void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; @@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) } } -void OpenCLDeviceBase::OpenCLProgram::add_error(string msg) +void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(!get_device_name(device_id, &device_name)) { return false; } + + int driver_major = 0; + int driver_minor = 0; + if(!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." 
<< driver_minor; + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework * (aka, it will not be on Intel framework). This isn't supported * and needs an explicit blacklist. @@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { + if(driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = { + /* GCN 1 */ + "Tahiti", "Pitcairn", "Capeverde", "Oland", + NULL + }; + for (int i = 0; blacklist[i] != NULL; i++) { + if(device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } return true; } if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { @@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device, return true; } -string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) { if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { /* Use cl_amd_device_topology extension. 
*/ @@ -902,7 +925,7 @@ bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string OpenCLInfo::get_platform_name(cl_platform_id platform_id) { string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { + if(!get_platform_name(platform_id, &platform_name)) { return ""; } return platform_name; @@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, - &length) == CL_SUCCESS) + &length) == CL_SUCCESS) { if(length != 0 && board_name[0] != '\0') { return board_name; @@ -1073,6 +1096,48 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +bool OpenCLInfo::get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DRIVER_VERSION, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + if(sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; +} + +int OpenCLInfo::mem_address_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index dbc2ba2503a..23e9bd311c4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -10,7 +10,23 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp 
kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_split.cl @@ -21,17 +37,22 @@ set(SRC kernels/opencl/kernel_lamp_emission.cl kernels/opencl/kernel_do_volume.cl kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked_ao.cl kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu ) set(SRC_BVH_HEADERS @@ -93,12 +114,18 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h ) set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_config.h ) +set(SRC_KERNELS_OPENCL_HEADERS + kernels/opencl/kernel_split_function.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -120,6 +147,8 @@ set(SRC_CLOSURE_HEADERS closure/bssrdf.h closure/emissive.h closure/volume.h + closure/bsdf_principled_diffuse.h + closure/bsdf_principled_sheen.h ) set(SRC_SVM_HEADERS @@ -186,6 +215,21 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + 
filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h @@ -194,17 +238,52 @@ set(SRC_UTIL_HEADERS ../util/util_math.h ../util/util_math_fast.h ../util/util_math_intersect.h + ../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h ../util/util_types.h + ../util/util_types_float2.h + ../util/util_types_float2_impl.h + ../util/util_types_float3.h + ../util/util_types_float3_impl.h + ../util/util_types_float4.h + ../util/util_types_float4_impl.h + ../util/util_types_int2.h + ../util/util_types_int2_impl.h + ../util/util_types_int3.h + ../util/util_types_int3_impl.h + ../util/util_types_int4.h + ../util/util_types_int4_impl.h + ../util/util_types_uchar2.h + ../util/util_types_uchar2_impl.h + ../util/util_types_uchar3.h + ../util/util_types_uchar3_impl.h + ../util/util_types_uchar4.h + ../util/util_types_uchar4_impl.h + ../util/util_types_uint2.h + ../util/util_types_uint2_impl.h + ../util/util_types_uint3.h + ../util/util_types_uint3_impl.h + ../util/util_types_uint4.h + ../util/util_types_uint4_impl.h + ../util/util_types_vector3.h + ../util/util_types_vector3_impl.h ) set(SRC_SPLIT_HEADERS + split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_indirect_background.h split/kernel_indirect_subsurface.h @@ -213,6 +292,8 @@ set(SRC_SPLIT_HEADERS split/kernel_path_init.h 
split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h split/kernel_shadow_blocked_ao.h split/kernel_shadow_blocked_dl.h @@ -256,23 +337,21 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) - if(${split}) - set(cuda_extra_flags "-D__SPLIT__") - set(cuda_cubin kernel_split) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel) - endif() - + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) - set(cuda_cubin ${cuda_cubin}_experimental) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() - set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + set(cuda_cubin ${name}_${arch}.cubin) if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") @@ -286,11 +365,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") - if(split) - set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") - else() - set(cuda_kernel_src "/kernels/cuda/kernel.cu") - endif() + set(cuda_kernel_src "/kernels/cuda/${name}.cu") add_custom_command( OUTPUT ${cuda_cubin} @@ -304,13 +379,13 @@ if(WITH_CYCLES_CUDA_BINARIES) ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
-DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -324,11 +399,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) endif() endforeach() @@ -349,41 +425,30 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - ) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") 
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_split_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel @@ -391,8 +456,10 @@ add_library(cycles_kernel ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -422,21 +489,28 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 9139b99353a..86a00d2124d 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -27,6 +27,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #ifdef __SUBSURFACE__ # include "kernel/closure/bssrdf.h" #endif @@ -86,16 +88,21 @@ ccl_device_forceinline int 
bsdf_sample(KernelGlobals *kg, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; @@ -130,6 +137,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -188,14 +206,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_reflect(sc, sd->I, 
omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -222,6 +245,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -256,14 +288,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = 
bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -290,6 +327,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #ifdef __SVM__ switch(sc->type) { case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: bsdf_microfacet_multi_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; @@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case 
CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: @@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); @@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. + * This is needed for the denoising feature pass generation, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-like closures. 
*/ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 1c7b3eb9ddd..b12e248f0a3 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct MicrofacetExtra { - float3 color; + float3 color, cspec0; + float clearcoat; } 
MicrofacetExtra; typedef ccl_addr_space struct MicrofacetBsdf { @@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ @@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched( return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } +/* Calculate the reflection color + * + * If fresnel is used, the color is an interpolation of the F0 color and white + * with respect to the fresnel + * + * Else it is simply white + */ +ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) { + float3 F = make_float3(1.0f, 1.0f, 1.0f); + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + if(use_fresnel) { + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + + F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0); + } + + return F; +} + +ccl_device_forceinline float D_GTR1(float NdotH, float alpha) +{ + if(alpha >= 1.0f) return M_1_PI_F; + float alpha2 = alpha*alpha; + float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH; + return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t); +} + /* GGX microfacet with Smith shadow-masking from: * * Microfacet Models for Refraction through Rough Surfaces @@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = 
saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) { const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a; @@ -273,16 +341,38 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = 
average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = saturate(bsdf->alpha_y); + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float alpha2 = alpha_x * alpha_y; float D, G1o, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic * eq. 20: (F*G*D)/(4*in*on) @@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } /* eq. 34: now calculate G1(i,m) and G1(o,m) */ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); @@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons /* eq. 20 */ float common = D * 0.25f / cosNO; - float out = G * common; + + float3 F = reflection_color(bsdf, omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + float3 out = F * G * common; /* eq. 
2 in distribution of visible normals sampling * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ @@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons * pdf = pm * 0.25 / dot(m, I); */ *pdf = G1o * common; - return make_float3(out, out, out); + return out; } return make_float3(0.0f, 0.0f, 0.0f); @@ -489,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + /* if fresnel is used, calculate the color with reflection_color(...) */ + if(use_fresnel) { + *pdf = 1.0f; + *eval = reflection_color(bsdf, *omega_in, m); + } + label = LABEL_REFLECT | LABEL_SINGULAR; } else { @@ -497,16 +617,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float alpha2 = alpha_x * alpha_y; float D, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic */ float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = 1/(cosThetaM2) - 1; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + + /* recalculate G1o */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } + /* eq. 
34: now calculate G1(i,m) */ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); } @@ -538,10 +674,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* see eval function for derivation */ float common = (G1o * D) * 0.25f / cosNO; - float out = G1i * common; *pdf = common; - *eval = make_float3(out, out, out); + float3 F = reflection_color(bsdf, *omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + *eval = G1i * common * F; } #ifdef __RAY_DIFFERENTIALS__ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 7d87727004f..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { - if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); - const float phi = M_2PI_F * randU.y; + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); @@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } -/* === Phase functions: Glossy, Diffuse and Glass === */ +/* === Phase functions: Glossy and Glass === */ -/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +/* Phase function for reflective materials. */ +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm) { - if(n && k) - *weight *= fresnel_conductor(dot(wi, wm), *n, *k); - return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l else phase *= D_ggx_aniso(wh, alpha); - if(n && k) { - /* Apply conductive fresnel term. 
*/ - return phase * fresnel_conductor(dotW_WH, *n, *k); - } - return make_float3(phase, phase, phase); } -/* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) -{ - float3 tm, bm; - make_orthonormals(wm, &tm, &bm); - - float2 disk = concentric_sample_disk(randu, randv); - return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; -} - -ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) -{ - const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; - return make_float3(v, v, v); -} - /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { @@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float 
multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; -} + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); -ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) -{ - return M_1_PI_F * wo.z; + float multiscatter = wo.z * M_1_PI_F; + + float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? 
-wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */ @@ -315,13 +320,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_MULTI_GLASS #include "kernel/closure/bsdf_microfacet_multi_impl.h" -/* The diffuse phase function is not implemented as a node yet. 
*/ -#if 0 -#define MF_PHASE_FUNCTION diffuse -#define MF_MULTI_DIFFUSE -#include "kernel/closure/bsdf_microfacet_multi_impl.h" -#endif - #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY #include "kernel/closure/bsdf_microfacet_multi_impl.h" @@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) bsdf->extra->color.x = saturate(bsdf->extra->color.x); bsdf->extra->color.y = saturate(bsdf->extra->color.y); bsdf->extra->color.z = saturate(bsdf->extra->color.z); - - bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } @@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + if(is_zero(bsdf->T)) + bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) { bsdf->alpha_y = bsdf->alpha_x; + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = 
fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); float3 X, Y, Z; Z = bsdf->N; @@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x); - return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *omega_in = 2*dot(Z, I)*Z - I; *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); +#ifdef __RAY_DIFFERENTIALS__ + *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; + *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; +#endif return LABEL_REFLECT|LABEL_SINGULAR; } + bool use_fresnel = (bsdf->type == 
CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); if(is_aniso) make_orthonormals_tangent(Z, bsdf->T, &X, &Y); @@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); if(is_aniso) *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else @@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); + bsdf->alpha_y = bsdf->alpha_x; + bsdf->ior = max(0.0f, bsdf->ior); + bsdf->extra->color.x = saturate(bsdf->extra->color.x); + bsdf->extra->color.y = saturate(bsdf->extra->color.y); + bsdf->extra->color.z = saturate(bsdf->extra->color.z); + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return 
SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; +} + ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; @@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color); } ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { @@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + float3 X, Y, Z; Z = bsdf->N; make_orthonormals(Z, &X, &Y); @@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 
*domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S } } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + make_orthonormals(Z, &X, &Y); float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); *eval *= *pdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 8054fa8e849..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -26,19 +26,16 @@ * the balance heuristic isn't necessarily optimal anymore. */ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( - float3 wi, - float3 wo, - const bool wo_outside, - const float3 color, - const float alpha_x, - const float alpha_y, - ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */ bool swapped = false; @@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Analytically compute single scattering for lower noise. 
*/ float3 eval; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + const float3 wh = normalize(wi+wo); #ifdef MF_MULTI_GLASS eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta); if(wo_outside) eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#elif defined(MF_MULTI_DIFFUSE) - /* Diffuse has no special closed form for the single scattering bounce */ - eval = make_float3(0.0f, 0.0f, 0.0f); #else /* MF_MULTI_GLOSSY */ - const float3 wh = normalize(wi+wo); const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) val *= D_ggx(wh, alpha.x); else val *= D_ggx_aniso(wh, alpha); - if(n && k) { - eval = fresnel_conductor(dot(wh, wi), *n, *k) * val; - } - else { - eval = make_float3(val, val, val); - } + eval = make_float3(val, val, val); #endif + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0); + + eval *= throughput; + } + float3 wr = -wi; float hr = 1.0f; float C1_r = 1.0f; float G1_r = 0.0f; bool outside = true; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); - -#ifdef MF_MULTI_DIFFUSE - if(order == 0) { - /* Compute single-scattering for diffuse. */ - const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r); - eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm); + /* Sample microfacet normal. 
*/ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); + +#ifdef MF_MULTI_GLASS + if(order == 0 && use_fresnel) { + /* Evaluate amount of scattering towards wo on this microfacet. */ + float3 phase; + if(outside) + phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); + else + phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta); + + eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda); } #endif if(order > 0) { @@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#elif defined(MF_MULTI_DIFFUSE) - phase = mf_eval_phase_diffuse(wo, wm); #else /* MF_MULTI_GLOSSY */ - phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput; + phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); } @@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; hr = -hr; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel && !next_outside) { + throughput *= color; + } + else if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif lambda_r = mf_lambda(wr, alpha); - throughput *= color; + if(!use_fresnel) + throughput *= color; C1_r = mf_C1(hr); G1_r = mf_G1(wr, C1_r, lambda_r); @@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. 
*/ -ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( + float3 wi, + float3 *wo, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { const float2 alpha = make_float2(alpha_x, alpha_y); @@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 float G1_r = 0.0f; bool outside = true; + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0); + } + int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ - if(order > 0) + if(!use_fresnel && order > 0) throughput *= color; /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; outside = !outside; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel) { + if(!next_outside) { + throughput *= color; + } + else { + float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel) { + float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif /* Update random walk parameters. 
*/ @@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 } #undef MF_MULTI_GLASS -#undef MF_MULTI_DIFFUSE #undef MF_MULTI_GLOSSY #undef MF_PHASE_FUNCTION diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h new file mode 100644 index 00000000000..f8ca64293b0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ +#define __BSDF_PRINCIPLED_DIFFUSE_H__ + +/* DISNEY PRINCIPLED DIFFUSE BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledDiffuseBsdf { + SHADER_CLOSURE_BASE; + + float roughness; +} PrincipledDiffuseBsdf; + +ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = max(dot(N, L), 0.0f); + float NdotV = max(dot(N, V), 0.0f); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV); + const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness; + float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV); + + float value = M_1_PI_F * NdotL * Fd; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + +ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_diffuse_brdf(bsdf, N, V, 
L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h new file mode 100644 index 00000000000..f4476bfecd0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_SHEEN_H__ +#define __BSDF_PRINCIPLED_SHEEN_H__ + +/* DISNEY PRINCIPLED SHEEN BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledSheenBsdf { + SHADER_CLOSURE_BASE; +} PrincipledSheenBsdf; + +ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = dot(N, L); + float NdotV = dot(N, V); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float value = schlick_fresnel(LdotH) * NdotL; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + 
const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b0c5280b6cb..3dc15d5791c 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k return(Rparl2 + Rperp2) * 0.5f; } +ccl_device float schlick_fresnel(float u) +{ + float m = clamp(1.0f - u, 0.0f, 1.0f); + float m2 = m * m; + return m2 * m2 * m; // pow(m, 5) +} + ccl_device float smooth_step(float edge0, float edge1, float x) { float result; @@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x) return result; } +/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */ +ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) { + /* Calculate the fresnel interpolation factor + * The value from 
fresnel_dielectric_cos(...) has to be normalized because + * the cspec0 keeps the F0 color + */ + float F0_norm = 1.0f / (1.0f - F0); + float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm; + + /* Blend between white and a specular color with respect to the fresnel */ + return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH; +} + CCL_NAMESPACE_END #endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index af0bbd861a9..f733ea4c517 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf { float d; float texture_blur; float albedo; - float3 N; + float roughness; } Bssrdf; /* Planar Truncated Gaussian @@ -360,10 +360,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) { if(bssrdf->radius < BSSRDF_MIN_RADIUS) { /* revert to diffuse BSDF if radius too small */ - DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; - bsdf->N = bssrdf->N; - int flag = bsdf_diffuse_setup(bsdf); - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + int flag; +#ifdef __PRINCIPLED__ + if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + float roughness = bssrdf->roughness; + float3 N = bssrdf->N; + float3 weight = bssrdf->weight; + float sample_weight = bssrdf->sample_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf; + + bsdf->N = N; + bsdf->roughness = roughness; + bsdf->weight = weight; + bsdf->sample_weight = sample_weight; + flag = bsdf_principled_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + else +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; + bsdf->N = bssrdf->N; + flag = bsdf_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + return flag; } else { @@ -371,7 +393,9 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf->sharpness = saturate(bssrdf->sharpness); bssrdf->type = type; - if(type == 
CLOSURE_BSSRDF_BURLEY_ID) { + if(type == CLOSURE_BSSRDF_BURLEY_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_ID) + { bssrdf_burley_setup(bssrdf); } @@ -385,7 +409,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_cubic_sample(sc, xi, r, h); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) bssrdf_gaussian_sample(sc, xi, r, h); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ bssrdf_burley_sample(sc, xi, r, h); } @@ -395,7 +419,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) return bssrdf_cubic_pdf(sc, r); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) return bssrdf_gaussian_pdf(sc, r); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ return bssrdf_burley_pdf(sc, r); } diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..6226ed2c2ef --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. 
*/ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + 
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + const ccl_global float *ccl_restrict transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int2 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + 
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3185330994c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. 
*/ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale, 
_mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..3e752bce68f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,186 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) +{ +#ifdef __KERNEL_SSE3__ + int aligned_lowx = (rect.x & ~(3)); + int aligned_highx = ((rect.z + 3) & ~(3)); +#endif + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { +#ifdef __KERNEL_SSE3__ + for(int x = aligned_lowx; x < aligned_highx; x+=4) { + _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + } +#else + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] += difference_image[y1*w+x]; + } +#endif + } + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + 
float *out_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + out_image[y*w+x] += difference_image[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accum_image[y*w+x] += weight; + out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. 
*/ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] /= accum_image[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..2c5ac807051 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += difference_image[y1*w+x]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + 
ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(out_image) { + accum_image[y*w+x] += sum; + out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accum_image[y*w+x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, int w) +{ + out_image[y*w+x] /= accum_image[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..a0b89c1111f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,211 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 
1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + if(use_split_variance) { + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + } + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 
0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if (sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } +} + +ccl_device void kernel_filter_detect_outliers(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *out, + int4 rect, + int pass_stride) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + int idx = (y-rect.y)*buffer_w + (x-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + float ref = 2.0f*values[(int)(n*0.75f)]; + float fac = 1.0f; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. 
Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } + out[idx ] = fac*image[idx]; + out[idx + pass_stride] = fac*image[idx + pass_stride]; + out[idx+2*pass_stride] = fac*image[idx+2*pass_stride]; +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). 
*/ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..25a3025056c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + if(weight < 1e-3f) { + return; + } + + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. 
*/ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..a5f87c05ec0 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. 
*/ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..83a1222bbdb --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. 
=== */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..30dc2969b11 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + __m128 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + __m128 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + } + + __m128 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + 
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8888000f0e6..5c3b0ee3c15 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I r_ext = mw_extension + r_curr; #ifdef __KERNEL_SSE__ const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); float d = dxxx.x; #else float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 47778553b94..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? 
Ng: N; } /* Ray differentials on triangle */ diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 06c0fb2fbca..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg, #define KERNEL_ARCH cpu #include "kernel/kernels/cpu/kernel_cpu.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 823d30dde78..9ed16aceb55 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -220,8 +220,16 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) #ifdef __SHADOW_TRICKS__ L->path_total = make_float3(0.0f, 0.0f, 0.0f); 
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); - L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -277,15 +285,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 alpha, float3 bsdf, - float3 ao, - int bounce) + float3 ao) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -302,31 +310,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + } #endif } ccl_device_inline void path_radiance_accum_total_ao( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 bsdf) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf; #endif } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void 
path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -352,21 +372,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + } #endif } ccl_device_inline void path_radiance_accum_total_light( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, const BsdfEval *bsdf_eval) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf_eval->sum_no_mis; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf_eval; #endif @@ -393,11 +419,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } } #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -555,29 +587,79 @@ ccl_device_inline float3 
path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi return L_sum; } +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) { float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +#else +# define safe_float3_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ + #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += 
L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac); + + safe_float3_add(L->background, L_sample->background*fac); + safe_float3_add(L->ao, L_sample->ao*fac); + safe_float3_add(L->shadow, L_sample->shadow*fac); +# ifdef __SPLIT_KERNEL__ + atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac); +# else L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; +# endif /* __SPLIT_KERNEL__ */ +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission*fac); + +#undef safe_float3_add } #ifdef __SHADOW_TRICKS__ @@ -595,16 +677,17 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) /* Calculate final light sum and transparency for shadow catcher object. 
*/ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const PathRadiance *L, - ccl_addr_space float* L_transparent) + float* alpha) { const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *L_transparent = shadow; - L_sum = make_float3(0.0f, 0.0f, 0.0f); + *alpha = 1.0f - L->shadow_throughput * shadow; + L_sum = L->shadow_radiance_sum; } else { - L_sum = L->shadow_color * shadow; + L_sum = L->shadow_background_color * L->shadow_throughput * shadow + + L->shadow_radiance_sum; } return L_sum; } diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 21da180bb8e..93934ee6b38 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -195,7 +195,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -222,7 +222,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -265,7 +265,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); @@ -335,7 +335,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. 
*/ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -374,7 +374,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -449,7 +449,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index c375d17a95f..38708f7ff0b 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,6 +55,11 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index c2263ac0d49..4836c290312 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,6 +50,8 @@ # define ccl_addr_space #endif +#define ATTR_FALLTHROUGH + #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index c9c97ea977e..f95f0d98c52 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -19,6 +19,10 @@ #ifndef __KERNEL_GLOBALS_H__ #define __KERNEL_GLOBALS_H__ +#ifdef __KERNEL_CPU__ +# include "util/util_vector.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -38,12 +42,12 @@ struct Intersection; struct VolumeStep; 
typedef struct KernelGlobals { - texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU]; - texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU]; - texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU]; - texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU]; - texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU]; - texture_image_half texture_half_images[TEX_NUM_HALF_CPU]; + vector<texture_image_float4> texture_float4_images; + vector<texture_image_uchar4> texture_byte4_images; + vector<texture_image_half4> texture_half4_images; + vector<texture_image_float> texture_float_images; + vector<texture_image_uchar> texture_byte_images; + vector<texture_image_half> texture_half_images; # define KERNEL_TEX(type, ttype, name) ttype name; # define KERNEL_IMAGE_TEX(type, ttype, name) diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h index 0352c58037d..90747e09357 100644 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ b/intern/cycles/kernel/kernel_image_opencl.h @@ -20,18 +20,19 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) { + const int texture_type = kernel_tex_type(id); /* Float4 */ - if(id < TEX_START_BYTE4_OPENCL) { + if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { return kernel_tex_fetch(__tex_image_float4_packed, offset); } /* Byte4 */ - else if(id < TEX_START_FLOAT_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); float f = 1.0f/255.0f; return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); } /* Float */ - else if(id < TEX_START_BYTE_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { float f = kernel_tex_fetch(__tex_image_float_packed, offset); return make_float4(f, f, f, 1.0f); } @@ -63,23 +64,34 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) return x - (float)i; } +ccl_device_inline uint kernel_decode_image_interpolation(uint4 info) +{ + return 
(info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; +} + +ccl_device_inline uint kernel_decode_image_extension(uint4 info) +{ + if(info.w & (1 << 1)) { + return EXTENSION_REPEAT; + } + else if(info.w & (1 << 2)) { + return EXTENSION_EXTEND; + } + else { + return EXTENSION_CLIP; + } +} + ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); uint width = info.x; uint height = info.y; uint offset = info.z; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. */ float4 r; int ix, iy, nix, niy; if(interpolation == INTERPOLATION_CLOSEST) { @@ -132,7 +144,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); } - return r; } @@ -144,17 +155,10 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, uint height = info.y; uint offset = info.z; uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. 
*/ float4 r; int ix, iy, iz, nix, niy, niz; if(interpolation == INTERPOLATION_CLOSEST) { @@ -171,7 +175,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) - { + { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } } @@ -198,12 +202,13 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, niz = svm_image_texture_wrap_periodic(iz+1, depth); } else { - if(extension == EXTENSION_CLIP) + if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } + } /* Fall through. */ /* EXTENSION_EXTEND */ nix = svm_image_texture_wrap_clamp(ix+1, width); @@ -224,8 +229,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - } - return r; } diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 67546131746..f5855757d3f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +/* TODO(sergey): Do some extra tests and consider moving to util_math.h. 
*/ +ccl_device_inline int cmj_isqrt(int value) { - kernel_assert(s < N); - #if defined(__KERNEL_CUDA__) - int m = float_to_int(__fsqrt_ru(N)); + return float_to_int(__fsqrt_ru(value)); +#elif defined(__KERNEL_GPU__) + return float_to_int(sqrtf(value)); #else - int m = float_to_int(sqrtf(N)); + /* This is a work around for fast-math on CPU which might replace sqrtf() + * with am approximated version. + */ + return float_to_int(sqrtf(value) + 1e-6f); #endif +} + +ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +{ + kernel_assert(s < N); + + int m = cmj_isqrt(N); int n = (N - 1)/m + 1; float invN = 1.0f/N; float invm = 1.0f/m; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..9baa9d54957 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index ed523696571..9cd7ffb181d 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -60,6 +60,140 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #endif /* __SPLIT_KERNEL__ */ } +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value) +{ + kernel_write_pass_float(buffer, sample, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. 
*/ +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+1, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float(buffer+1, sample, 0.0f); + } + else { + float new_mean = buffer[0] * (1.0f / (sample + 1)); + float old_mean = (buffer[0] - value) * (1.0f / sample); + kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +# if defined(__SPLIT_KERNEL__) +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value) +{ + buffer[0] = (sample == 0)? value.x: buffer[0] + value.x; + buffer[1] = (sample == 0)? value.y: buffer[1] + value.y; + buffer[2] = (sample == 0)? value.z: buffer[2] + value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, sample, value); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); + } + else { + float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); + float3 new_mean = sum * (1.0f / (sample + 1)); + float3 old_mean = (sum - value) * (1.0f / sample); + kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? 
DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, sample/2, path_total); + kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+2, sample/2, value*value); +# else + if(sample < 2) { + kernel_write_pass_float(buffer+2, sample/2, 0.0f); + } + else { + float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); + float new_value = buffer[1] / max(buffer[0], 1e-7f); + kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); + } +# endif +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + /* Skip implicitly transparent surfaces. */ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + return; + } + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. 
*/ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -199,5 +333,88 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f #endif } +ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, + int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) +{ + if(L) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + + kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L, sample); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + noisy = L_sum; + clean = make_float3(0.0f, 0.0f, 0.0f); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + path_radiance_split_denoising(kg, L, &noisy, &clean); + } + kernel_write_pass_float3_variance(buffer + 
kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, noisy); + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, ensure_finite3(L_sum)); + } + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, L->denoising_normal); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, L->denoising_albedo); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + } + else { + kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, make_float3(0.0f, 0.0f, 0.0f)); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, 0.0f); + + if(kernel_data.film.pass_denoising_clean) { + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, make_float3(0.0f, 0.0f, 0.0f)); + } + } +#endif /* __DENOISING_FEATURES__ */ + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 
e7957042182..c340b3bc968 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput, float3 ao_alpha) @@ -90,14 +90,16 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } +#ifndef __SPLIT_KERNEL__ + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -364,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { @@ -403,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) +#if defined(__EMISSION__) if(kernel_data.integrator.use_direct_light) { int all = (kernel_data.integrator.sample_all_lights_indirect) || (state->flag & PATH_RAY_SHADOW_CATCHER); @@ -417,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, L, all); } -#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */ +#endif /* defined(__EMISSION__) */ if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) break; @@ -425,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG 
*rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -515,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float3 emission; if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __LAMP_MIS__ */ @@ -547,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -557,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. 
if we use distance sampling and take just @@ -575,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -589,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -621,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -638,11 +643,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = 
sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -675,7 +685,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -693,7 +703,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -713,10 +723,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -727,7 +739,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(kernel_path_subsurface_scatter(kg, &sd, &emission_sd, - &L, + L, &state, rng, &ray, @@ -740,15 +752,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* 
__SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); + kernel_path_subsurface_accum_indirect(&ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. @@ -758,7 +770,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, &ss_indirect, &state, &ray, - &L, + L, &throughput); } else { @@ -767,24 +779,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -805,18 +808,21 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - 
kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 36fd6c95fe7..77d4f1df447 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput) { @@ -56,29 +56,48 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); } } } } +#ifndef __SPLIT_KERNEL__ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but 
in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + for(int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -110,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba &tp, &ps, L, - &bsdf_ray)) + &bsdf_ray, + sum_sample_weight)) { continue; } @@ -242,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -329,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = kernel_data.integrator.sample_all_lights_direct; kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect light sampling */ @@ -337,11 +362,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in float 
num_samples_inv = 1.0f/num_samples; for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - PathState ps = state; Ray pray = ray; float3 tp = throughput; @@ -352,8 +372,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* scatter sample. if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -366,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -377,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp*num_samples_inv, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } } /* emission and transmittance */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); throughput *= 
volume_segment.accum_transmittance; /* free cached steps */ @@ -411,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, rng, &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -435,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } #endif /* __VOLUME_SCATTER__ */ @@ -466,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -479,13 +499,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) 
{ - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); + state.catcher_object = sd.object; + if(!kernel_data.background.transparent) { + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } else { state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; @@ -513,13 +536,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); #ifdef __EMISSION__ /* emission */ if(sd.flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -543,10 +566,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); } #endif /* __AO__ */ @@ -554,7 +579,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, rng, &ray, throughput); } #endif /* __SUBSURFACE__ 
*/ @@ -567,13 +592,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -602,24 +627,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -640,20 +656,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = 
kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index c0cd2a63120..5d92fd12201 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -128,6 +138,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __DENOISING_FEATURES__ + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } +#endif } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 076c82f3853..dcb577e176f 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__) /* 
branched path tracing: connect path directly to position on one or more lights and add it to L */ ccl_device_noinline void kernel_branched_path_surface_connect_light( KernelGlobals *kg, @@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -155,7 +155,8 @@ ccl_device bool 
kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - Ray *ray) + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce( /* modify throughput */ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif + /* modify path state */ path_state_next(kg, state, label); @@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 371f2c1c7cb..dcedf51e479 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } @@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, 
tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index 96bc636d5ac..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index( return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + CCL_NAMESPACE_END 
#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index d4f0caff5de..e8a912ccc0b 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -20,14 +20,15 @@ CCL_NAMESPACE_BEGIN #ifdef __SOBOL__ -/* skip initial numbers that are not as well distributed, especially the +/* Skip initial numbers that are not as well distributed, especially the * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination */ + * path termination. + */ #define SOBOL_SKIP 64 -/* High Dimensional Sobol */ +/* High Dimensional Sobol. */ -/* van der corput radical inverse */ +/* Van der Corput radical inverse. */ ccl_device uint van_der_corput(uint bits) { bits = (bits << 16) | (bits >> 16); @@ -38,58 +39,63 @@ ccl_device uint van_der_corput(uint bits) return bits; } -/* sobol radical inverse */ +/* Sobol radical inverse. */ ccl_device uint sobol(uint i) { uint r = 0; - - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) - if(i & 1) + for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) { + if(i & 1) { r ^= v; - + } + } return r; } -/* inverse of sobol radical inverse */ +/* Inverse of sobol radical inverse. */ ccl_device uint sobol_inverse(uint i) { const uint msb = 1U << 31; uint r = 0; - - for(uint v = 1; i; i <<= 1, v ^= v << 1) - if(i & msb) + for(uint v = 1; i; i <<= 1, v ^= v << 1) { + if(i & msb) { r ^= v; - + } + } return r; } -/* multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */ +/* Multidimensional sobol with generator matrices + * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively. 
+ */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index; - - for(uint j = 0; i; i >>= 1, j++) - if(i & 1) + for(uint j = 0; i; i >>= 1, j++) { + if(i & 1) { result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j); - + } + } return result; } -/* lookup index and x/y coordinate, assumes m is a power of two */ -ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y) +/* Lookup index and x/y coordinate, assumes m is a power of two. */ +ccl_device uint sobol_lookup(const uint m, + const uint frame, + const uint ex, + const uint ey, + uint *x, uint *y) { - /* shift is constant per frame */ + /* Shift is constant per frame. */ const uint shift = frame << (m << 1); const uint sobol_shift = sobol(shift); - /* van der Corput is its own inverse */ + /* Van der Corput is its own inverse. */ const uint lower = van_der_corput(ex << (32 - m)); - /* need to compensate for ey difference and shift */ + /* Need to compensate for ey difference and shift. */ const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */ + const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */ const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* only use m upper bits for the index (m is a power of two) */ + /* Only use m upper bits for the index (m is a power of two). 
*/ const uint sobol_result = delta | (delta >> m); const uint upper = sobol_inverse(sobol_result); const uint index = shift | upper | lower; @@ -98,11 +104,14 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; return cmj_sample_1D(sample, num_samples, p); } @@ -113,7 +122,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample float r = (float)result * (1.0f/(float)0xFFFFFFFF); return r; #else - /* compute sobol sequence value using direction vectors */ + /* Compute sobol sequence value using direction vectors. */ uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); @@ -130,24 +139,33 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); } else #endif { - /* sobol */ + /* Sobol. 
*/ *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -182,29 +200,43 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* nothing to do */ } -#else +#else /* __SOBOL__ */ /* Linear Congruential Generator */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { /* implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } -ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { /* load state */ *rng = *rng_state; @@ -220,13 
+252,15 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* store state for next sample */ *rng_state = rng; } -#endif +#endif /* __SOBOL__ */ /* Linear Congruential Generator */ @@ -257,49 +291,108 @@ ccl_device uint lcg_init(uint seed) * dimension to avoid using the same sequence twice. * * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. */ + * in a sequence and offset accordingly. + */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - /* the rng_offset is not increased for transparent bounces. if we do then + /* The rng_offset is not increased for transparent bounces. if we do then * fully transparent objects can become subtly visible by the different * sampling patterns used where the transparent object is. 
* * however for some random numbers that will determine if we next bounce * is transparent we do need to increase the offset to avoid always making - * the same decision */ - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); + * the same decision. */ + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension, + fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension) { - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, 
+ int num_branches, + int dimension) { - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension, + fx, fy); } -/* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state) +/* Utitility functions to get light termination value, + * since it might not be needed in many cases. 
+ */ +ccl_device_inline float path_state_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); @@ -307,15 +400,27 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D_for_decision(kg, + rng, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) { /* path is splitting into a branch, adjust so that each branch * still gets a unique sample from the same sequence */ @@ -324,14 +429,17 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int br state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, + int rng_offset, + int sample, + uint scramble) { return lcg_init(*rng + rng_offset + sample*scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { - /* implicit mod 2^32 */ + /* Implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } diff --git 
a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 8c0c5e90a3e..c66f52255f0 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ @@ -186,7 +186,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index db6f839d9ed..fab5946970d 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -379,7 +379,7 @@ ccl_device bool shadow_blocked_transparent_stepped( float3 *shadow) { bool blocked, is_transparent_isect; - if (skip_object == OBJECT_NONE) { + if(skip_object == OBJECT_NONE) { blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index f75e9337bdb..6475d4b66fd 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, } /* replace closures with a single diffuse bsdf closure 
after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; sd->randb_closure = 0.0f; @@ -148,15 +148,35 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei sd->num_closure_extra = 0; if(hit) { - DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if(bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + Bssrdf *bssrdf = (Bssrdf *)sc; +#ifdef __PRINCIPLED__ + if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = bssrdf->roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) || + CLOSURE_IS_BSSRDF(bssrdf->type)) +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } } } } @@ -379,6 +399,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup( #else Ray *ray = &ss_isect->ray; #endif + + /* Workaround for AMD GPU OpenCL compiler. 
Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif + /* Setup new shading point. */ shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); @@ -388,12 +414,11 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N); /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } -#ifndef __SPLIT_KERNEL__ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -454,6 +479,10 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS if(ss_isect.num_hits > 0) { float3 origP = sd->P; + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif /* setup new shading point */ shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray); @@ -479,9 +508,8 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N); /* setup diffuse bsdf */ - subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } -#endif /* ! 
__SPLIT_KERNEL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..aa5b32803a5 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -82,10 +82,10 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions) # if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) @@ -93,91 +93,93 @@ KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. 
*/ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) # else /* bindless textures */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 623f3728c69..e6a62c42a38 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN # define WORK_POOL_SIZE WORK_POOL_SIZE_CPU #endif + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -71,21 +83,18 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifndef __SPLIT_KERNEL__ -# define __BRANCHED_PATH__ -# endif +# define __BRANCHED_PATH__ # ifdef WITH_OSL # define __OSL__ # endif +# define __PRINCIPLED__ # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define 
__SHADOW_RECORD_ALL__ -# ifndef __SPLIT_KERNEL__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_RECORD_ALL__ -# endif +# define __VOLUME_DECOUPLED__ +# define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ @@ -94,10 +103,11 @@ CCL_NAMESPACE_BEGIN # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ # ifndef __SPLIT_KERNEL__ # define __BRANCHED_PATH__ -# define __CMJ__ # endif #endif /* __KERNEL_CUDA__ */ @@ -109,43 +119,44 @@ CCL_NAMESPACE_BEGIN # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_NVIDIA__ */ # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ +# define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. 
*/ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __PRINCIPLED__ +# define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ #endif /* __KERNEL_OPENCL__ */ @@ -165,6 +176,8 @@ CCL_NAMESPACE_BEGIN #define __PATCH_EVAL__ #define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ + #ifdef __KERNEL_SHADING__ # define __SVM__ # define __EMISSION__ @@ -220,7 +233,13 @@ CCL_NAMESPACE_BEGIN # undef __TRANSPARENT_SHADOWS__ #endif #ifdef __NO_SHADOW_TRICKS__ -#undef __SHADOW_TRICKS__ +# undef __SHADOW_TRICKS__ +#endif +#ifdef __NO_PRINCIPLED__ +# undef __PRINCIPLED__ +#endif +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ #endif /* Random Numbers */ @@ -303,31 +322,32 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! 
*/ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE = (1 << 7), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 11), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_SHADOW_CATCHER = 32768, - PATH_RAY_SHADOW_CATCHER_ONLY = 65536, + PATH_RAY_MIS_SKIP = (1 << 12), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), + PATH_RAY_SINGLE_PASS_DONE = (1 << 14), + PATH_RAY_SHADOW_CATCHER = (1 << 15), + PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), + PATH_RAY_STORE_SHADOW_INFO = (1 << 17), }; /* Closure Label */ @@ -383,6 +403,22 @@ typedef enum PassType { #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + 
+ DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -416,6 +452,18 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; @@ -469,8 +517,20 @@ typedef ccl_addr_space struct PathRadiance { float3 path_total_shaded; /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_color; + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. 
+ */ + float3 shadow_radiance_sum; + float shadow_throughput; #endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ } PathRadiance; typedef struct BsdfEval { @@ -713,12 +773,13 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; /* Shader Context @@ -949,6 +1010,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -1126,6 +1191,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; @@ -1298,7 +1368,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1311,22 +1380,42 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. 
*/ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ @@ -1341,14 +1430,25 @@ enum RayState { RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ RAY_REGENERATED, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. 
*/ - RAY_SHADOW_RAY_CAST_DL = 32, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 9c0878249d4..1e472aaf51a 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -660,6 +660,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. 
if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. 
+ */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..2ed713299fd --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. 
*/ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float 
*transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..8dc1a8d583c --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,272 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. 
+ */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); +#else + kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, 
int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float 
*out_image, + float *accum_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..1a7b2040da1 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 16992c681e6..998619ac897 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -95,9 +95,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float4")) { texture_image_float4 *tex = NULL; int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = id; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float4_images.size()) { + kg->texture_float4_images.resize(array_index+1); + } tex = &kg->texture_float4_images[array_index]; } @@ -111,9 +114,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float")) { texture_image_float *tex = NULL; int id = atoi(name + strlen("__tex_image_float_")); - int array_index = id - TEX_START_FLOAT_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float_images.size()) { + kg->texture_float_images.resize(array_index+1); + } tex = &kg->texture_float_images[array_index]; } @@ -127,9 +133,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte4")) { texture_image_uchar4 *tex = NULL; int id = atoi(name + strlen("__tex_image_byte4_")); - int 
array_index = id - TEX_START_BYTE4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte4_images.size()) { + kg->texture_byte4_images.resize(array_index+1); + } tex = &kg->texture_byte4_images[array_index]; } @@ -143,9 +152,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte")) { texture_image_uchar *tex = NULL; int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = id - TEX_START_BYTE_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte_images.size()) { + kg->texture_byte_images.resize(array_index+1); + } tex = &kg->texture_byte_images[array_index]; } @@ -159,9 +171,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half4")) { texture_image_half4 *tex = NULL; int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = id - TEX_START_HALF4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half4_images.size()) { + kg->texture_half4_images.resize(array_index+1); + } tex = &kg->texture_half4_images[array_index]; } @@ -175,9 +190,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half")) { texture_image_half *tex = NULL; int id = atoi(name + strlen("__tex_image_half_")); - int array_index = id - TEX_START_HALF_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half_images.size()) { + kg->texture_half_images.resize(array_index+1); + } tex = &kg->texture_half_images[array_index]; } diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 
2600d977972..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index dba15d037ac..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 896b80d783e..c8938534fe8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -77,16 +77,17 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) 
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index af68907a5c2..f6bb4c25012 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -23,51 +23,59 @@ CCL_NAMESPACE_BEGIN ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y); - else - return kg->texture_float4_images[tex].interp(x, y); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y); + } } ccl_device float4 
kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z); - else - return kg->texture_float4_images[tex].interp_3d(x, y, z); - + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + } } ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_HALF4_CPU) 
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation); - else - return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 148b2eef568..d4315ee5ec4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -22,38 +22,50 @@ #include "kernel/kernel_compat_cpu.h" -#ifndef __SPLIT_KERNEL__ -# include "kernel/kernel_math.h" -# include "kernel/kernel_types.h" - -# include "kernel/split/kernel_split_data.h" -# include "kernel/kernel_globals.h" - -# include "kernel/kernels/cpu/kernel_cpu_image.h" -# include "kernel/kernel_film.h" -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -# include "kernel/kernel_bake.h" +#ifndef KERNEL_STUB +# ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include 
"kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +# else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" +# include "kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +# endif /* __SPLIT_KERNEL__ */ #else -# include "kernel/split/kernel_split_common.h" - -# include "kernel/split/kernel_data_init.h" -# include "kernel/split/kernel_path_init.h" -# include "kernel/split/kernel_scene_intersect.h" -# include "kernel/split/kernel_lamp_emission.h" -# include "kernel/split/kernel_do_volume.h" -# include "kernel/split/kernel_queue_enqueue.h" -# include "kernel/split/kernel_indirect_background.h" -# include "kernel/split/kernel_shader_eval.h" -# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -# include "kernel/split/kernel_subsurface_scatter.h" -# include "kernel/split/kernel_direct_lighting.h" -# include 
"kernel/split/kernel_shadow_blocked_ao.h" -# include "kernel/split/kernel_shadow_blocked_dl.h" -# include "kernel/split/kernel_next_iteration_setup.h" -# include "kernel/split/kernel_indirect_subsurface.h" -# include "kernel/split/kernel_buffer_update.h" -#endif +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) + +# ifdef __SPLIT_KERNEL__ +# include "kernel/split/kernel_data_init.h" +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN @@ -69,7 +81,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, @@ -80,10 +95,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, stride); } else -#endif +# endif { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -96,6 +112,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -103,6 +122,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -113,6 +133,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -120,6 +143,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -134,9 +158,12 @@ void 
KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -145,7 +172,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif } else { kernel_shader_evaluate(kg, @@ -156,24 +183,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, sample); } +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ kernel_##name(kg); \ } -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ ccl_local type locals; \ kernel_##name(kg, &locals); \ } +#endif /* KERNEL_STUB */ DEFINE_SPLIT_KERNEL_FUNCTION(path_init) DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) @@ -181,49 +223,22 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) 
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) - -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) -{ -#define REGISTER_NAME_STRING(name) #name -#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) -#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); - - REGISTER(path_trace); - REGISTER(convert_to_byte); - REGISTER(convert_to_half_float); - REGISTER(shader); - - REGISTER(data_init); - REGISTER(path_init); - REGISTER(scene_intersect); - REGISTER(lamp_emission); - REGISTER(do_volume); - REGISTER(queue_enqueue); - REGISTER(indirect_background); - REGISTER(shader_eval); - REGISTER(holdout_emission_blurring_pathtermination_ao); - REGISTER(subsurface_scatter); - REGISTER(direct_lighting); - REGISTER(shadow_blocked_ao); - REGISTER(shadow_blocked_dl); - REGISTER(next_iteration_setup); - REGISTER(indirect_subsurface); - REGISTER(buffer_update); - -#undef REGISTER -#undef REGISTER_EVAL_NAME -#undef REGISTER_NAME_STRING -} - #endif /* __SPLIT_KERNEL__ */ +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 27a746a0799..6ba3425a343 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ 
b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -17,22 +17,25 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 364d279a189..76b2d77ebb8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -18,23 +18,25 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index 0afb481296f..b468b6f44c8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 13d00813591..3e5792d0b17 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -18,19 +18,21 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index a4312071edc..3629f21cd29 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -18,20 +18,22 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 1acfaa91ac9..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index f7b6a2e21fe..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 1900c6e3012..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu//kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..009c3fde9d5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,255 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_detect_outliers(float *image, + float *variance, + float *depth, + float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = 
prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x 
< rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, float *accum_image, + int4 rect, int w, + int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < 
rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index a679eff8409..628891b1458 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -31,12 +31,15 @@ #include "kernel/split/kernel_do_volume.h" #include "kernel/split/kernel_queue_enqueue.h" #include 
"kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" #include "kernel/split/kernel_shader_eval.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" #include "kernel/split/kernel_subsurface_scatter.h" #include "kernel/split/kernel_direct_lighting.h" #include "kernel/split/kernel_shadow_blocked_ao.h" #include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" #include "kernel/split/kernel_next_iteration_setup.h" #include "kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" @@ -108,12 +111,15 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..ba53ba4b26f --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,280 @@ +/* + * Copyright 2011-2017 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + 
mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ 
+ int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, + int w) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global 
float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) +{ + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl index db65c91baf7..dcea2630aef 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_buffer_update.h" -__kernel void kernel_ocl_path_trace_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index eb34f750881..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl index 83ef5f5f3f2..8afaa686e28 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_do_volume.h" -__kernel void 
kernel_ocl_path_trace_do_volume( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_do_volume((KernelGlobals*)kg); -} +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index d071b39aa6f..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -18,12 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local BackgroundAOLocals locals; - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals*)kg, - &locals); -} +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl index 8c213ff5cb2..192d01444ba 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_background.h" -__kernel void kernel_ocl_path_trace_indirect_background( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_background((KernelGlobals*)kg); -} +#define 
KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl index 998ebc4c0c3..84938b889e5 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_subsurface.h" -__kernel void kernel_ocl_path_trace_indirect_subsurface( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_subsurface((KernelGlobals*)kg); -} +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 822d2287715..c314dc96c33 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_lamp_emission((KernelGlobals*)kg); -} +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d207253a40..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_next_iteration_setup.h" -__kernel 
void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl index bd9aa9538c8..fa210e747c0 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_path_init.h" -__kernel void kernel_ocl_path_trace_path_init( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_path_init((KernelGlobals*)kg); -} +#define KERNEL_NAME path_init +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 9be154e3d75..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_queue_enqueue.h" -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local QueueEnqueueLocals locals; - kernel_queue_enqueue((KernelGlobals*)kg, &locals); -} +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl 
index eb4fb4d153a..10d09377ba9 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_scene_intersect((KernelGlobals*)kg); -} +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 6baee460986..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shader_eval.h" -__kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..8c36100f762 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..bcacaa4a054 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl index 6a8ef81b32a..8de250a375c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_ao.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_ao((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl index b255cc5ef8b..29da77022ed 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_dl.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_dl( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_dl((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 732cda30115..651addb02f4 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -23,12 +23,15 @@ #include "kernel/kernels/opencl/kernel_do_volume.cl" #include "kernel/kernels/opencl/kernel_indirect_background.cl" #include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" #include "kernel/kernels/opencl/kernel_shader_eval.cl" #include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" #include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" #include "kernel/kernels/opencl/kernel_direct_lighting.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" #include "kernel/kernels/opencl/kernel_buffer_update.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..f1e914a70d4 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,72 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + ccl_global uint *rng_state, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel/kernel_textures.h" + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.rng_state = rng_state; + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel/kernel_textures.h" + } + + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl index 7a1838e485f..2b3be38df84 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_subsurface_scatter.h" -__kernel void kernel_ocl_path_trace_subsurface_scatter( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int 
local_queue_atomics; - kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 95beea01d25..27a96720c1e 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -39,7 +39,9 @@ #include "kernel/kernel_montecarlo.h" #include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" #include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" #include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN @@ -78,6 +80,7 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -89,6 +92,7 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -100,6 +104,7 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params() CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure) +/* Disney principled */ + +class PrincipledBSSRDFClosure : public CBSSRDFClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + } +}; + +ClosureParam *closure_bssrdf_principled_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N), + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur), + 
CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness), + CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure) + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index f44714c2150..14c5c1c3db5 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -60,6 +60,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN @@ -154,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -162,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, 
params.T), @@ -176,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein) VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR) VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption) +BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness), +BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse) + +BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N), +BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) + +/* DISNEY PRINCIPLED CLEARCOAT */ +class PrincipledClearcoatClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float clearcoat, clearcoat_roughness; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + + bsdf->ior = 1.5f; + + bsdf->alpha_x = clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + return bsdf; + } + + return NULL; + } + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_principled_clearcoat_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), + CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure) + + /* Registration */ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare) @@ -215,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare); register_closure(ss, "microfacet_multi_ggx_aniso", id++, closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare); + register_closure(ss, "microfacet_ggx_fresnel", id++, + closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare); + register_closure(ss, "microfacet_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare); register_closure(ss, "microfacet_beckmann", id++, 
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); register_closure(ss, "microfacet_beckmann_aniso", id++, @@ -229,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); + register_closure(ss, "principled_diffuse", id++, + bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare); + register_closure(ss, "principled_sheen", id++, + bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare); + register_closure(ss, "principled_clearcoat", id++, + closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare); register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare); @@ -248,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "bssrdf_burley", id++, closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare); + register_closure(ss, "bssrdf_principled", id++, + closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); @@ -278,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } + +/* GGX closures with Fresnel */ + +class MicrofacetFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX Glass closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); + +class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure); + + /* Multiscattering GGX closures */ class MicrofacetMultiClosure : public CBSDFClosure { @@ -287,7 +444,7 @@ public: MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { - /* Technically, the MultiGGX Glass closure may also transmit. However, + /* Technically, the MultiGGX closure may also transmit. However, * since this is set statically and only used for caustic flags, this * is probably as good as it gets. */ if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { @@ -375,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params() } CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure); + +/* Multiscattering GGX closures with Fresnel */ + +class MicrofacetMultiFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); + +class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure); + +class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {} + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure); + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 929cf00a7e6..ff5fd9cc905 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); OSL::ClosureParam *closure_bssrdf_burley_params(); +OSL::ClosureParam *closure_bssrdf_principled_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(); +OSL::ClosureParam 
*closure_bsdf_principled_clearcoat_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); void closure_background_prepare(OSL::RendererServices *, int id, void *data); @@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data); +void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data); #define CCLOSURE_PREPARE(name, classname) \ void name(RendererServices *, int id, void *data) \ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index b767c60c617..1535496c73d 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -824,7 +824,7 @@ bool 
OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index b43f8402d42..1a8ed4c884a 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,13 +81,15 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_principled_bsdf.osl node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS - node_texture.h node_color.h node_fresnel.h + node_ramp_util.h + node_texture.h stdosl.h oslutil.h ) diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl new file mode 100644 index 00000000000..6870d479af3 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "stdosl.h" +#include "node_fresnel.h" + +shader node_principled_bsdf( + string distribution = "Multiscatter GGX", + color BaseColor = color(0.8, 0.8, 0.8), + float Subsurface = 0.0, + vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), + color SubsurfaceColor = color(0.7, 0.1, 0.1), + float Metallic = 0.0, + float Specular = 0.5, + float SpecularTint = 0.0, + float Roughness = 0.5, + float Anisotropic = 0.0, + float AnisotropicRotation = 0.0, + float Sheen = 0.0, + float SheenTint = 0.5, + float Clearcoat = 0.0, + float ClearcoatRoughness = 0.03, + float IOR = 1.45, + float Transmission = 0.0, + float TransmissionRoughness = 0.0, + normal Normal = N, + normal ClearcoatNormal = N, + normal Tangent = normalize(dPdu), + output closure color BSDF = 0) +{ + float f = max(IOR, 1e-5); + float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); + float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); + float specular_weight = (1.0 - final_transmission); + + vector T = Tangent; + + float m_cdlum = luminance(BaseColor); + color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. 
to isolate hue+sat + + /* rotate tangent */ + if (AnisotropicRotation != 0.0) + T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal); + + if (diffuse_weight > 1e-5) { + if (Subsurface > 1e-5) { + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness); + } else { + BSDF = BaseColor * principled_diffuse(Normal, Roughness); + } + + if (Sheen > 1e-5) { + color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; + + BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal); + } + + BSDF = BSDF * diffuse_weight; + } + + if (specular_weight > 1e-5) { + float aspect = sqrt(1.0 - Anisotropic * 0.9); + float r2 = Roughness * Roughness; + + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; + + color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; + + color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; + + if (distribution == "GGX" || Roughness <= 0.075) { + BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } else { + BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } + } + + if (final_transmission > 1e-5) { + color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); + float eta = backfacing() ? 
1.0 / f : f; + + if (distribution == "GGX" || Roughness <= 5e-2) { + float cosNO = dot(Normal, I); + float Fr = fresnel_dielectric_cos(cosNO, eta); + + float refl_roughness = Roughness; + if (Roughness <= 1e-2) + refl_roughness = 0.0; + + float transmission_roughness = refl_roughness; + if (distribution == "GGX") + transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness); + + BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) + + (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta)); + } else { + BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0); + } + } + + if (Clearcoat > 1e-5) { + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); + } +} + diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index a8dda8a12c9..c91d2918687 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN; closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN; closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN; +closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; 
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; @@ -539,11 +544,15 @@ closure color emission() BUILTIN; closure color background() BUILTIN; closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; +closure color principled_diffuse(normal N, float roughness) BUILTIN; +closure color principled_sheen(normal N) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN; closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN; +closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN; // Hair closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..e2762a85fc8 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(sd); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + +#undef BRANCHED_STORE + + /* set loop counters to intial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(sd); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + 
ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; + SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + inactive_L->direct_throughput = L->direct_throughput; + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state, + bool wait_for_shared) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + ccl_global PathState *ps = 
&kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + RNG bsdf_rng = cmj_hash(rng, i); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + &bsdf_rng, + sd, + sc, + j, + num_samples, + tp, + ps, + L, + bsdf_ray, + sum_sample_weight)) + { + continue; + } + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* start the indirect path */ + *tp *= num_samples_inv; + + 
if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + + return true; + } + + branched_state->next_sample = 0; + } + + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 859c221d976..4c1fdd2d69c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,24 +111,14 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } - kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L_rad); + bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); + path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 9d3d01fff75..e4545d66eff 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ 
b/intern/cycles/kernel/split/kernel_data_init.h @@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( unsigned int num_samples, ccl_global float *buffer) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, data_init); +#else + #ifdef __KERNEL_OPENCL__ kg->data = data; #endif @@ -105,21 +109,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* Initialize queue data and queue index. */ if(thread_index < queuesize) { - /* Initialize active ray queue. */ - kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } } if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. 
*/ @@ -148,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( *(rng_state + index) = hash_int_2d(x, y); } } + +#endif /* KERENL_STUB */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index bdbf7387b95..3336c968a44 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; @@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ RNG rng = kernel_split_state.rng[ray_index]; + bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + # ifdef __SHADOW_TRICKS__ if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { flag = false; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_branched_path_surface_connect_light(kg, - &rng, - sd, - emission_sd, - 
state, - throughput, - 1.0f, - L, - 1); + enqueue_flag = 1; } # endif /* __SHADOW_TRICKS__ */ + if(flag) { /* Sample illumination from lights to find path contribution. */ float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); @@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.bsdf_eval[ray_index] = L_light; kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } @@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, #endif /* __EMISSION__ */ } -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - #ifdef __EMISSION__ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ enqueue_ray_index_local(ray_index, @@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.queue_data, kernel_split_params.queue_index); #endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 47d3c280831..9f8dd2392d9 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -16,6 +16,100 @@ CCL_NAMESPACE_BEGIN +#if 
defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. 
+ */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + + return true; + } +# endif + } + + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ ccl_device void kernel_do_volume(KernelGlobals *kg) { @@ -23,37 +117,36 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) /* We will empty this queue in this kernel. */ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ } - /* Fetch use_queues_flag. */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if(local_use_queues_flag) { + + if(*kernel_split_params.use_queues_flag) { ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } } - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global char *ray_state = kernel_split_state.ray_state; - bool hit = ! 
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; RNG rng = kernel_split_state.rng[ray_index]; ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); /* Sanitize volume stack. */ if(!hit) { @@ -64,31 +157,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) Ray volume_ray = *ray; volume_ray.t = (hit)? 
isect->t: FLT_MAX; - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); # ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED); - else - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER); + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ } -# endif + +# ifdef __BRANCHED_PATH__ } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ } + kernel_split_state.rng[ray_index] = rng; } -#endif +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = 
get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ } diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 9fc853a84bf..fec671be016 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) @@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } ccl_barrier(CCL_LOCAL_MEM_FENCE); +#ifdef __AO__ char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; +#endif int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -122,14 +124,22 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if (state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + if(state->flag & PATH_RAY_CAMERA) { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + 
state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state->catcher_object = sd->object; if(!kernel_data.background.transparent) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + L->shadow_background_color = indirect_background( + kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -155,8 +165,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } } #endif /* __HOLDOUT__ */ @@ -164,18 +173,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - buffer, - L, - sd, - sample, - state, - throughput); + +#ifdef __BRANCHED_PATH__ + if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) +#endif /* __BRANCHED_PATH__ */ + { + /* Holdout mask objects do not write data passes. */ + kernel_write_data_passes(kg, + buffer, + L, + sd, + sample, + state, + throughput); + } + /* Blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy. 
*/ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { +#ifndef __BRANCHED_PATH__ + if(kernel_data.integrator.filter_glossy != FLT_MAX) +#else + if(kernel_data.integrator.filter_glossy != FLT_MAX && + (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))) +#endif /* __BRANCHED_PATH__ */ + { float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; @@ -201,85 +223,62 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. */ +#ifndef __BRANCHED_PATH__ float probability = path_state_terminate_probability(kg, state, throughput); +#else + float probability = 1.0f; + + if(!kernel_data.integrator.branched) { + probability = path_state_terminate_probability(kg, state, throughput); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + int num_samples = kernel_split_state.branched_state[ray_index].num_samples; + probability = path_state_terminate_probability(kg, state, throughput*num_samples); + } + else if(state->flag & PATH_RAY_TRANSPARENT) { + probability = path_state_terminate_probability(kg, state, throughput); + } +#endif if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } else { kernel_split_state.throughput[ray_index] = throughput/probability; } } + + kernel_update_denoising_features(kg, sd, state, L); } } #ifdef __AO__ if(IS_STATE(ray_state, ray_index, 
RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (sd->flag & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(sd->P, sd->Ng); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = sd->time; -#endif - _ray.dP = sd->dP; - _ray.dD = differential3_zero(); - kernel_split_state.ao_light_ray[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; + kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } #endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_bg, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - #ifdef __AO__ /* Enqueue to-shadow-ray-cast rays. 
*/ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, + enqueue_flag, kernel_split_params.queue_size, &locals->queue_atomics_ao, kernel_split_state.queue_data, diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index 8192528622e..f0ebb90f60a 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -23,7 +23,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); int ray_index; - if(kernel_data.integrator.ao_bounces) { + if(kernel_data.integrator.ao_bounces != INT_MAX) { ray_index = get_ray_index(kg, thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, @@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; if(state->bounce > kernel_data.integrator.ao_bounces) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } } @@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { @@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); path_radiance_accum_background(L, state, (*throughput), L_background); #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h 
b/intern/cycles/kernel/split/kernel_indirect_subsurface.h index a56e85abeb9..82bc2f01fd7 100644 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - kernel_path_subsurface_accum_indirect(ss_indirect, L); +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + kernel_path_subsurface_accum_indirect(ss_indirect, L); - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. - */ - if(ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, - ss_indirect, - state, - ray, - L, - throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. 
+ */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#ifdef __BRANCHED_PATH__ } +#endif #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 1bebc16e25b..7758e35fd32 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + } + else { + /* Update Path State */ + state->flag |= PATH_RAY_TRANSPARENT; + state->transparent_bounce++; + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + 
kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ccl_local_param unsigned int *local_queue_atomics) { @@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; } - char enqueue_flag = 0; int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -75,102 +120,127 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; ccl_global char *ray_state = kernel_split_state.ray_state; - /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; - float3 _throughput = kernel_split_state.throughput[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - kernel_split_state.ao_alpha[ray_index], - kernel_split_state.ao_bsdf[ray_index], - shadow, - state->bounce); - } - else { - path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. 
*/ + if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = kernel_split_state.light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - if(update_path_radiance) { - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - kernel_split_state.is_lamp[ray_index]); + else { + kernel_split_branched_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { - path_radiance_accum_total_light(L, _throughput, &L_light); + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + + kernel_split_state.rng[ray_index] = rng; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } -#endif + ccl_barrier(CCL_LOCAL_MEM_FENCE); - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), kernel_split_params.queue_size, local_queue_atomics, kernel_split_state.queue_data, kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h index e2e841f36d3..66ce2dfb6f1 100644 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg, int queue_number = -1; if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) { + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; } else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h 
b/intern/cycles/kernel/split/kernel_scene_intersect.h index 5dc94caec85..45984ca509b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) } /* All regenerated rays become active here */ - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* __BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + } + } - if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { return; + } #ifdef __KERNEL_DEBUG__ DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 0f1696e34a0..2801b32f285 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,54 +16,61 @@ CCL_NAMESPACE_BEGIN -/* This kernel sets up the ShaderData structure from the values computed +/* This kernel evaluates ShaderData structure from the values computed * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ -ccl_device void kernel_shader_eval(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif kernel_split_state.queue_data, kernel_split_params.queue_size, 0); - char enqueue_flag = 0; - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { - enqueue_flag = 1; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; } - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. 
*/ - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; + ccl_global char *ray_state = kernel_split_state.ray_state; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - shader_setup_from_ray(kg, - &kernel_split_state.sd[ray_index], - &isect, - &ray); +#ifndef __BRANCHED_PATH__ float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); +#else + ShaderContext ctx = SHADER_CONTEXT_MAIN; + float rbsdf = 0.0f; + + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + + } + + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + ctx = SHADER_CONTEXT_INDIRECT; + } + + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); + shader_merge_closures(&kernel_split_state.sd[ray_index]); +#endif /* __BRANCHED_PATH__ */ + kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..0432689d9fa --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. 
*/ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + + shader_setup_from_ray(kg, + &kernel_split_state.sd[ray_index], + &isect, + &ray); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..297decb0bc2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_state.sd[ray_index].shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for (uint inc = length; inc > 0; inc >>= 1) { + for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + 
bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 4243e18de72..474286285a9 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } - if(ray_index == QUEUE_EMPTY_SLOT) + if(ray_index == QUEUE_EMPTY_SLOT) { return; + } - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index]; - - float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); } +#endif + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index bb8f0157965..78e61709b01 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -29,31 +29,82 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. */ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + if(ray_index == QUEUE_EMPTY_SLOT) return; - /* Flag determining if we need to update L. 
*/ - char update_path_radiance = 0; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + bool is_lamp = kernel_split_state.is_lamp[ray_index]; + +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index]; + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + &rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + + if(!shadow_blocked(kg, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); + } } + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 4303ba0a905..08f0124b529 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -37,41 +37,55 @@ #include "util/util_atomic.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_passes.h" - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" #endif -#ifdef __VOLUME__ -# include "kernel/kernel_volume.h" -#endif +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" +#endif -#ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = 
kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L, 1); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 17e6587883a..eac22050a38 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -31,14 +31,6 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_ size = size SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */ -#endif - -#ifdef __VOLUME__ - size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */ -#endif - return size; } @@ 
-57,16 +49,6 @@ ccl_device_inline void split_data_init(KernelGlobals *kg, SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p; - p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); -#endif - -#ifdef __VOLUME__ - split_data->state_shadow = (ccl_global PathState*)p; - p += align_up(2 * num_elements * sizeof(PathState), 16); -#endif - split_data->ray_state = ray_state; } diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 748197b7183..4bb2f0d3d80 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -43,6 +43,9 @@ typedef struct SplitParams { ccl_global char *use_queues_flag; ccl_global float *buffer; + + /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ + int dummy_sd_flag; } SplitParams; /* Global memory variables [porting]; These memory is used for @@ -59,7 +62,64 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(DebugData, debug_data, 1) #else # define SPLIT_DATA_DEBUG_ENTRIES -#endif +#endif /* DEBUG */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + struct ShaderData sd; + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + int num_samples; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + SubsurfaceIntersection ss_isect; + +# ifdef __VOLUME__ + VolumeStack volume_stack[VOLUME_STACK_SIZE]; +# endif /* __VOLUME__ */ +#endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool 
waiting_on_shared_samples; +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ + +#ifdef __SUBSURFACE__ +# define SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) +#else +# define SPLIT_DATA_SUBSURFACE_ENTRIES +#endif /* __SUBSURFACE__ */ + +#ifdef __VOLUME__ +# define SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) +#else +# define SPLIT_DATA_VOLUME_ENTRIES +#endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ @@ -69,9 +129,6 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ @@ -79,6 +136,28 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) 
*/ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ @@ -87,14 +166,6 @@ typedef struct SplitData { SPLIT_DATA_ENTRIES #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - ccl_global SubsurfaceIndirectRays *ss_rays; -#endif - -#ifdef __VOLUME__ - ccl_global PathState *state_shadow; -#endif - /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from * the host easily) but is still used the same as the other data so we have it here in this struct as well */ @@ -122,6 +193,11 @@ typedef struct BackgroundAOLocals { uint queue_atomics_ao; } BackgroundAOLocals; +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index 0b4d50c70ee..d5083b23f80 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -16,82 +16,306 @@ CCL_NAMESPACE_BEGIN +#if defined(__BRANCHED_PATH__) && 
defined(__SUBSURFACE__) -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, - ccl_local_param unsigned int* local_queue_atomics) +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) { -#ifdef __SUBSURFACE__ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &branched_state->sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init(&rng, + branched_state->path_state.rng_offset, + branched_state->path_state.sample, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(rng, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 
branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + &bssrdf_rng, + &branched_state->path_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + RNG lcg_state = branched_state->lcg_state; + SubsurfaceIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index]; + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + SubsurfaceIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + &branched_state->path_state, + branched_state->path_state.flag, + sc, + true); + *ss_isect = ss_isect_private; + + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + + path_state_branch(hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + /* this next part is expensive as it does scene intersection so only do once */ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + branched_state->volume_stack[k] = hit_state->volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + branched_state->volume_stack); + } + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->volume_stack[k]; + } + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &rng, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int 
thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - - char enqueue_flag = 0; - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); +#ifdef __SUBSURFACE__ ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = 
&kernel_split_state.path_radiance[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + if(sd->flag & SD_BSSRDF) { - if(kernel_path_subsurface_scatter(kg, - sd, - emission_sd, - L, - state, - &rng, - ray, - throughput, - ss_indirect)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + &rng, + ray, + throughput, + ss_indirect)) + { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + float bssrdf_probability; + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + + /* modify throughput for picking bssrdf or bsdf */ + *throughput *= bssrdf_probability; + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + &rng, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); + subsurface_scatter_step(kg, + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); + } + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#endif } kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && 
ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; } -#endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 1885e1af851..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -76,6 +76,345 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float param2 = (stack_valid(param2_offset))? 
stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_ID: { + uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, + anisotropic_rotation_offset, transmission_roughness_offset; + uint4 data_node2 = read_node(kg, offset); + + float3 T = stack_load_float3(stack, data_node.y); + decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); + decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); + + // get Disney principled parameters + float metallic = param1; + float subsurface = param2; + float specular = stack_load_float(stack, specular_offset); + float roughness = stack_load_float(stack, roughness_offset); + float specular_tint = stack_load_float(stack, specular_tint_offset); + float anisotropic = stack_load_float(stack, anisotropic_offset); + float sheen = stack_load_float(stack, sheen_offset); + float sheen_tint = stack_load_float(stack, sheen_tint_offset); + float clearcoat = stack_load_float(stack, clearcoat_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); + float transmission = stack_load_float(stack, transmission_offset); + float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); + float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); + float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f); + + ClosureType distribution = stack_valid(data_node2.y) ? 
(ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + + /* rotate tangent */ + if(anisotropic_rotation != 0.0f) + T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); + + /* calculate ior */ + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; + + // calculate fresnel for refraction + float cosNO = dot(N, sd->I); + float fresnel = fresnel_dielectric_cos(cosNO, ior); + + // calculate weights of the diffuse and specular part + float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission)); + + float final_transmission = saturate(transmission) * (1.0f - saturate(metallic)); + float specular_weight = (1.0f - final_transmission); + + // get the base color + uint4 data_base_color = read_node(kg, offset); + float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : + make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w)); + + // get the additional clearcoat normal and subsurface scattering radius + uint4 data_cn_ssr = read_node(kg, offset); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; + float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + + // get the subsurface color + uint4 data_subsurface_color = read_node(kg, offset); + float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : + make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); + + float3 weight = sd->svm_closure_weight * mix_weight; + +#ifdef __SUBSURFACE__ + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; + float subsurf_sample_weight = fabsf(average(subsurf_weight)); + + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + subsurface = 0.0f; + + /* need to set the base color in this case such that the + * rays get the correctly mixed color after transmitting + * the object */ + base_color = mixed_ss_base_color; + } + + /* diffuse */ + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) { + /* radius * scale */ + float3 radius = subsurface_radius * subsurface; + /* sharpness */ + float sharpness = 0.0f; + /* texture color blur */ + float texture_blur = 0.0f; + + /* create one closure per color channel */ + Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.x; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.x; + 
bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.y; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.y; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.z; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.z; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + } + } +#else + /* diffuse */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } +#endif + + /* sheen */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) { + float m_cdlum = linear_rgb_to_gray(base_color); + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. 
to isolate hue+sat + + /* color of the sheen component */ + float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint; + + float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight; + + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight); + + if(bsdf) { + bsdf->N = N; + + /* setup bsdf */ + sd->flag |= bsdf_principled_sheen_setup(bsdf); + } + } + + /* specular reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) { + float3 spec_weight = weight * specular_weight; + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; + bsdf->T = T; + bsdf->extra = extra; + + float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); + float r2 = roughness * roughness; + + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; + + float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. 
to isolate hue+sat + float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint; + + bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic; + bsdf->extra->color = base_color; + + /* setup bsdf */ + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + else /* use multi-scatter GGX */ + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* BSDF */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(final_transmission > CLOSURE_WEIGHT_CUTOFF) { + float3 glass_weight = weight * final_transmission; + float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint); + + if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */ + float refl_roughness = roughness; + + /* reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + + bsdf->alpha_x = refl_roughness * refl_roughness; + bsdf->alpha_y = refl_roughness * refl_roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } + } + + /* refraction */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + 
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel)); + + if(bsdf) { + bsdf->N = N; + + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) + transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness); + else + transmission_roughness = refl_roughness; + + bsdf->alpha_x = transmission_roughness * transmission_roughness; + bsdf->alpha_y = transmission_roughness * transmission_roughness; + bsdf->ior = ior; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + } + else { /* use multi-scatter GGX */ + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + + bsdf->alpha_x = roughness * roughness; + bsdf->alpha_y = roughness * roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* clearcoat */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(clearcoat > CLOSURE_WEIGHT_CUTOFF) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = clearcoat_normal; + bsdf->ior = 1.5f; + bsdf->extra = extra; + + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + /* setup bsdf */ + sd->flag |= 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + break; + } +#endif /* __PRINCIPLED__ */ case CLOSURE_BSDF_DIFFUSE_ID: { float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); @@ -110,6 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; sd->flag |= bsdf_transparent_setup(bsdf); } break; @@ -344,6 +684,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __CAUSTICS_TRICKS__ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; + ATTR_FALLTHROUGH; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { float3 weight = sd->svm_closure_weight * mix_weight; @@ -370,6 +711,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. 
we @@ -383,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index c94fa130af7..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 4a09d9f6653..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, #ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 76acc9253a1..7be03dcd65a 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,29 +16,10 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. 
*/ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { #ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = kernel_tex_image_interp(id, x, y); -# else float4 r = kernel_tex_image_interp(id, x, y); -# endif #elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); #else @@ -56,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; + case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; + case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; + case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; + case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; + case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = 
kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = 
kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = 
kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = 
kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; + case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; + case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; + case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; + case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; + case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; + case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; + case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; + case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; + case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; + case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; + case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; + case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; + case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; + case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; + case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; + case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; + case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; + case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; + case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; + case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; + case 289: r = 
kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; + case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; + case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; + case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; + case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; + case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; + case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; + case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; + case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; + case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; + case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; + case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; + case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; + case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; + case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; + case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; + case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; + case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; + case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; + case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; + case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; + case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; + case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; + case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; + case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; + case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; + case 497: r 
= kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; + case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; + case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; + case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; + case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; + case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; + case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; + case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; + case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; + case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; + case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; + case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; + case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; + case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; + case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; + case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; + case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; + case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; + case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; + case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; + case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; + case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -151,8 +132,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # else CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); /* float4, byte4 and half4 */ - if(id < TEX_START_FLOAT_CUDA_KEPLER) + const 
int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_float4(tex, x, y); + } /* float, byte and half */ else { float f = kernel_tex_image_interp_float(tex, x, y); @@ -161,40 +147,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # endif #endif -#ifdef __KERNEL_SSE2__ - float alpha = r.w; + const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_ssef = r_ssef / ssef(alpha); - if(id >= TEX_NUM_FLOAT4_IMAGES) - r_ssef = min(r_ssef, ssef(1.0f)); - r.w = alpha; - } - - if(srgb) { - r_ssef = color_srgb_to_scene_linear(r_ssef); - r.w = alpha; - } -#else - if(use_alpha && r.w != 1.0f && r.w != 0.0f) { - float invw = 1.0f/r.w; - r.x *= invw; - r.y *= invw; - r.z *= invw; - - if(id >= TEX_NUM_FLOAT4_IMAGES) { - r.x = min(r.x, 1.0f); - r.y = min(r.y, 1.0f); - r.z = min(r.z, 1.0f); + r /= alpha; + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_BYTE) + { + r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); } + r.w = alpha; } if(srgb) { - r.x = color_srgb_to_scene_linear(r.x); - r.y = color_srgb_to_scene_linear(r.y); - r.z = color_srgb_to_scene_linear(r.z); + r = color_srgb_to_scene_linear_v4(r); } -#endif return r; } @@ -336,8 +304,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 47209ddfbab..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -397,17 +397,23 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_ID, 
CLOSURE_BSDF_OREN_NAYAR_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, + CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID, + CLOSURE_BSDF_PRINCIPLED_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, + CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, + CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, @@ -416,24 +422,26 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, /* Special cases */ CLOSURE_BSDF_BSSRDF_ID, + CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID, CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ CLOSURE_BSSRDF_CUBIC_ID, CLOSURE_BSSRDF_GAUSSIAN_ID, + CLOSURE_BSSRDF_PRINCIPLED_ID, CLOSURE_BSSRDF_BURLEY_ID, /* Other */ @@ -447,19 +455,24 @@ typedef enum ClosureType { CLOSURE_VOLUME_ABSORPTION_ID, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, + CLOSURE_BSDF_PRINCIPLED_ID, + NBUILTIN_CLOSURES } ClosureType; /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type 
<= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) -#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ - type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) + type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -468,7 +481,8 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define 
CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 9e826c8c23f..f4a5b2b2994 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -46,8 +46,13 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, # if defined(__KERNEL_CUDA__) # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < TEX_START_HALF4_CUDA_KEPLER) + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); + } else { float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); r = make_float4(f, f, f, 1.0f); diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fe2c2e78926..cf402c3f214 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -42,6 +42,9 @@ BufferParams::BufferParams() full_width = 0; full_height = 0; + denoising_data_pass = false; + denoising_clean_pass = false; + Pass::add(PASS_COMBINED, passes); } @@ -68,10 +71,25 @@ int BufferParams::get_passes_size() for(size_t i = 0; i < passes.size(); i++) size += passes[i].components; - + + if(denoising_data_pass) { + size += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + } + return align_up(size, 4); } +int BufferParams::get_denoising_offset() +{ + int offset = 0; + + for(size_t i = 0; i < passes.size(); i++) + offset += passes[i].components; + + return offset; +} + /* Render Buffer 
Task */ RenderTile::RenderTile() @@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } -bool RenderBuffers::copy_from_device() +bool RenderBuffers::copy_from_device(Device *from_device) { if(!buffer.device_pointer) return false; - device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + if(!from_device) { + from_device = device; + } + + from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + + return true; +} + +bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels) +{ + float scale = 1.0f/sample; + + if(offset == DENOISING_PASS_COLOR) { + scale *= exposure; + } + else if(offset == DENOISING_PASS_COLOR_VAR) { + scale *= exposure*exposure; + } + + offset += params.get_denoising_offset(); + float *in = (float*)buffer.data_pointer + offset; + int pass_stride = params.get_passes_size(); + int size = params.width*params.height; + + if(components == 1) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; + } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + } + } + else { + return false; + } return true; } diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 5c78971678a..e56556c8abe 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -51,6 +51,9 @@ public: /* passes */ array<Pass> passes; + bool denoising_data_pass; + /* If only some light path types should be denoised, an additional pass is needed. 
*/ + bool denoising_clean_pass; /* functions */ BufferParams(); @@ -59,6 +62,7 @@ public: bool modified(const BufferParams& params); void add_pass(PassType type); int get_passes_size(); + int get_denoising_offset(); }; /* Render Buffers */ @@ -73,18 +77,19 @@ public: /* random number generator state */ device_vector<uint> rng_state; + Device *device; + explicit RenderBuffers(Device *device); ~RenderBuffers(); void reset(Device *device, BufferParams& params); - bool copy_from_device(); + bool copy_from_device(Device *from_device = NULL); bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels); + bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels); protected: void device_free(); - - Device *device; }; /* Display Buffer @@ -131,6 +136,9 @@ protected: class RenderTile { public: + typedef enum { PATH_TRACE, DENOISE } Task; + + Task task; int x, y, w, h; int start_sample; int num_samples; @@ -138,6 +146,7 @@ public: int resolution; int offset; int stride; + int tile_index; device_ptr buffer; device_ptr rng_state; diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp index 2569d9eec27..943b218f0e4 100644 --- a/intern/cycles/render/constant_fold.cpp +++ b/intern/cycles/render/constant_fold.cpp @@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp) bypass(input->link); return true; } + else { + /* disconnect other inputs if we can't fully bypass due to clamp */ + foreach(ShaderInput *other, node->inputs) { + if(other != input && other->link) { + graph->disconnect(other); + } + } + } return false; } diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 7809f4345f1..c8213d258d5 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -279,6 +279,10 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false); + 
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); + SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_INT(denoising_flags, "Denoising Flags", 0); + return type; } @@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride += pass.components; } + kfilm->pass_denoising_data = 0; + kfilm->pass_denoising_clean = 0; + kfilm->denoising_flags = 0; + if(denoising_data_pass) { + kfilm->pass_denoising_data = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; + kfilm->denoising_flags = denoising_flags; + if(denoising_clean_pass) { + kfilm->pass_denoising_clean = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; + kfilm->use_light_pass = 1; + } + } + kfilm->pass_stride = align_up(kfilm->pass_stride, 4); kfilm->pass_alpha_threshold = pass_alpha_threshold; @@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->mist_inv_depth = (mist_depth > 0.0f)? 
1.0f/mist_depth: 0.0f; kfilm->mist_falloff = mist_falloff; + pass_stride = kfilm->pass_stride; + denoising_data_offset = kfilm->pass_denoising_data; + denoising_clean_offset = kfilm->pass_denoising_clean; + need_update = false; } diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 83c941d5c57..29b1e7e9157 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -57,8 +57,15 @@ public: float exposure; array<Pass> passes; + bool denoising_data_pass; + bool denoising_clean_pass; + int denoising_flags; float pass_alpha_threshold; + int pass_stride; + int denoising_data_offset; + int denoising_clean_offset; + FilterType filter_type; float filter_width; size_t filter_table_offset; diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index 12fff8e5587..2d810ff664f 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -423,7 +423,8 @@ void ShaderGraph::copy_nodes(ShaderNodeSet& nodes, ShaderNodeMap& nnodemap) /* Graph simplification */ /* ******************** */ -/* Step 1: Remove proxy nodes. +/* Remove proxy nodes. + * * These only exists temporarily when exporting groups, and we must remove them * early so that node->attributes() and default links do not see them. */ @@ -493,7 +494,8 @@ void ShaderGraph::remove_proxy_nodes() } } -/* Step 2: Constant folding. +/* Constant folding. + * * Try to constant fold some nodes, and pipe result directly to * the input socket of connected nodes. */ @@ -554,7 +556,7 @@ void ShaderGraph::constant_fold() } } -/* Step 3: Simplification. */ +/* Simplification. */ void ShaderGraph::simplify_settings(Scene *scene) { foreach(ShaderNode *node, nodes) { @@ -562,7 +564,7 @@ void ShaderGraph::simplify_settings(Scene *scene) } } -/* Step 4: Deduplicate nodes with same settings. */ +/* Deduplicate nodes with same settings. 
*/ void ShaderGraph::deduplicate_nodes() { /* NOTES: @@ -638,6 +640,48 @@ void ShaderGraph::deduplicate_nodes() } } +/* Check whether volume output has meaningful nodes, otherwise + * disconnect the output. + */ +void ShaderGraph::verify_volume_output() +{ + /* Check whether we can optimize the whole volume graph out. */ + ShaderInput *volume_in = output()->input("Volume"); + if(volume_in->link == NULL) { + return; + } + bool has_valid_volume = false; + ShaderNodeSet scheduled; + queue<ShaderNode*> traverse_queue; + /* Schedule volume output. */ + traverse_queue.push(volume_in->link->parent); + scheduled.insert(volume_in->link->parent); + /* Traverse down the tree. */ + while(!traverse_queue.empty()) { + ShaderNode *node = traverse_queue.front(); + traverse_queue.pop(); + /* Node is fully valid for volume, can't optimize anything out. */ + if(node->has_volume_support()) { + has_valid_volume = true; + break; + } + foreach(ShaderInput *input, node->inputs) { + if(input->link == NULL) { + continue; + } + if(scheduled.find(input->link->parent) != scheduled.end()) { + continue; + } + traverse_queue.push(input->link->parent); + scheduled.insert(input->link->parent); + } + } + if(!has_valid_volume) { + VLOG(1) << "Disconnect meaningless volume output."; + disconnect(volume_in->link); + } +} + void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack) { visited[node->id] = true; @@ -666,16 +710,11 @@ void ShaderGraph::clean(Scene *scene) { /* Graph simplification */ - /* 1: Remove proxy nodes was already done. */ - - /* 2: Constant folding. */ + /* NOTE: Remove proxy nodes was already done. */ constant_fold(); - - /* 3: Simplification. */ simplify_settings(scene); - - /* 4: De-duplication. */ deduplicate_nodes(); + verify_volume_output(); /* we do two things here: find cycles and break them, and remove unused * nodes that don't feed into the output. 
how cycles are broken is @@ -998,6 +1037,9 @@ int ShaderGraph::get_num_closures() else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) { num_closures += 2; } + else if(CLOSURE_IS_PRINCIPLED(closure_type)) { + num_closures += 8; + } else { ++num_closures; } diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 09932695d1f..72e391991a7 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -155,7 +155,7 @@ public: virtual bool has_spatial_varying() { return false; } virtual bool has_object_dependency() { return false; } virtual bool has_integrator_dependency() { return false; } - + virtual bool has_volume_support() { return false; } vector<ShaderInput*> inputs; vector<ShaderOutput*> outputs; @@ -284,6 +284,7 @@ protected: void constant_fold(); void simplify_settings(Scene *scene); void deduplicate_nodes(); + void verify_volume_output(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index a8c4f446bea..02b65440154 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -30,6 +30,16 @@ CCL_NAMESPACE_BEGIN +/* Some helpers to silence warning in templated function. 
*/ +static bool isfinite(uchar /*value*/) +{ + return false; +} +static bool isfinite(half /*value*/) +{ + return false; +} + ImageManager::ImageManager(const DeviceInfo& info) { need_update = true; @@ -49,54 +59,24 @@ ImageManager::ImageManager(const DeviceInfo& info) } /* Set image limits */ -#define SET_TEX_IMAGES_LIMITS(ARCH) \ - { \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF4] = TEX_NUM_HALF4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE] = TEX_NUM_BYTE_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF] = TEX_NUM_HALF_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_START_FLOAT4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = TEX_START_BYTE4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF4] = TEX_START_HALF4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = TEX_START_FLOAT_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE] = TEX_START_BYTE_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF] = TEX_START_HALF_ ## ARCH; \ - } - - if(device_type == DEVICE_CPU) { - SET_TEX_IMAGES_LIMITS(CPU); - } - else if(device_type == DEVICE_CUDA) { - if(info.has_bindless_textures) { - SET_TEX_IMAGES_LIMITS(CUDA_KEPLER); - } - else { - SET_TEX_IMAGES_LIMITS(CUDA); + max_num_images = TEX_NUM_MAX; + has_half_images = true; + cuda_fermi_limits = false; + + if(device_type == DEVICE_CUDA) { + if(!info.has_bindless_textures) { + /* CUDA Fermi hardware (SM 2.x) has a hard limit on the number of textures */ + cuda_fermi_limits = true; + has_half_images = false; } } else if(device_type == DEVICE_OPENCL) { - SET_TEX_IMAGES_LIMITS(OPENCL); - } - else { - /* Should not happen. 
*/ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF] = 0; - assert(0); + has_half_images = false; } -#undef SET_TEX_IMAGES_LIMITS + for(size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_num_images[type] = 0; + } } ImageManager::~ImageManager() @@ -133,18 +113,20 @@ bool ImageManager::set_animation_frame_update(int frame) return false; } -ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filename, - void *builtin_data, - bool& is_linear) +ImageDataType ImageManager::get_image_metadata(const string& filename, + void *builtin_data, + bool& is_linear, + bool& builtin_free_cache) { bool is_float = false, is_half = false; is_linear = false; + builtin_free_cache = false; int channels = 4; if(builtin_data) { if(builtin_image_info_cb) { int width, height, depth; - builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels); + builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels, builtin_free_cache); } if(is_float) { @@ -226,26 +208,28 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen } } -/* We use a consecutive slot counting scheme on the devices, in order - * float4, byte4, half4, float, byte, half. +int ImageManager::max_flattened_slot(ImageDataType type) +{ + if(tex_num_images[type] == 0) { + /* No textures for the type, no slots needs allocation. 
*/ + return 0; + } + return type_index_to_flattened_slot(tex_num_images[type], type); +} + +/* The lower three bits of a device texture slot number indicate its type. * These functions convert the slot ids from ImageManager "images" ones - * to device ones and vice versa. */ + * to device ones and vice verse. + */ int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type) { - return slot + tex_start_images[type]; + return (slot << IMAGE_DATA_TYPE_SHIFT) | (type); } int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type) { - for(int i = IMAGE_DATA_NUM_TYPES - 1; i >= 0; i--) { - if(flat_slot >= tex_start_images[i]) { - *type = (ImageDataType)i; - return flat_slot - tex_start_images[i]; - } - } - - /* Should not happen. */ - return flat_slot; + *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK); + return flat_slot >> IMAGE_DATA_TYPE_SHIFT; } string ImageManager::name_from_type(int type) @@ -290,8 +274,9 @@ int ImageManager::add_image(const string& filename, { Image *img; size_t slot; + bool builtin_free_cache; - ImageDataType type = get_image_metadata(filename, builtin_data, is_linear); + ImageDataType type = get_image_metadata(filename, builtin_data, is_linear, builtin_free_cache); thread_scoped_lock device_lock(device_mutex); @@ -299,14 +284,22 @@ int ImageManager::add_image(const string& filename, is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4); /* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */ - if((type == IMAGE_DATA_TYPE_FLOAT || - type == IMAGE_DATA_TYPE_HALF4 || - type == IMAGE_DATA_TYPE_HALF) && - tex_num_images[type] == 0) { - type = IMAGE_DATA_TYPE_FLOAT4; + if(!has_half_images) { + if(type == IMAGE_DATA_TYPE_HALF4) { + type = IMAGE_DATA_TYPE_FLOAT4; + } + else if(type == IMAGE_DATA_TYPE_HALF) { + type = IMAGE_DATA_TYPE_FLOAT; + } } - if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) { - type = 
IMAGE_DATA_TYPE_BYTE4; + + if(cuda_fermi_limits) { + if(type == IMAGE_DATA_TYPE_FLOAT) { + type = IMAGE_DATA_TYPE_FLOAT4; + } + else if(type == IMAGE_DATA_TYPE_BYTE) { + type = IMAGE_DATA_TYPE_BYTE4; + } } /* Fnd existing image. */ @@ -338,14 +331,30 @@ int ImageManager::add_image(const string& filename, break; } - if(slot == images[type].size()) { - /* Max images limit reached. */ - if(images[type].size() == tex_num_images[type]) { + /* Count if we're over the limit */ + if(cuda_fermi_limits) { + if(tex_num_images[IMAGE_DATA_TYPE_BYTE4] == TEX_NUM_BYTE4_CUDA + || tex_num_images[IMAGE_DATA_TYPE_FLOAT4] == TEX_NUM_FLOAT4_CUDA) + { printf("ImageManager::add_image: Reached %s image limit (%d), skipping '%s'\n", - name_from_type(type).c_str(), tex_num_images[type], filename.c_str()); + name_from_type(type).c_str(), tex_num_images[type], filename.c_str()); return -1; } + } + else { + /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */ + int tex_count = 0; + for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_count += tex_num_images[type]; + } + if(tex_count > max_num_images) { + printf("ImageManager::add_image: Reached image limit (%d), skipping '%s'\n", + max_num_images, filename.c_str()); + return -1; + } + } + if(slot == images[type].size()) { images[type].resize(images[type].size() + 1); } @@ -353,6 +362,7 @@ int ImageManager::add_image(const string& filename, img = new Image(); img->filename = filename; img->builtin_data = builtin_data; + img->builtin_free_cache = builtin_free_cache; img->need_load = true; img->animated = animated; img->frame = frame; @@ -363,6 +373,8 @@ int ImageManager::add_image(const string& filename, images[type][slot] = img; + ++tex_num_images[type]; + need_update = true; return type_index_to_flattened_slot(slot, type); @@ -436,7 +448,12 @@ void ImageManager::tag_reload_image(const string& filename, } } -bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int 
&height, int &depth, int &components) +bool ImageManager::file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components) { if(img->filename == "") return false; @@ -475,8 +492,8 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid if(!builtin_image_info_cb || !builtin_image_pixels_cb) return false; - bool is_float; - builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components); + bool is_float, free_cache; + builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components, free_cache); } /* we only handle certain number of components */ @@ -557,13 +574,15 @@ bool ImageManager::file_load_image(Image *img, builtin_image_float_pixels_cb(img->filename, img->builtin_data, (float*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else if(FileFormat == TypeDesc::UINT8) { builtin_image_pixels_cb(img->filename, img->builtin_data, (uchar*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else { /* TODO(dingto): Support half for ImBuf. */ @@ -618,6 +637,37 @@ bool ImageManager::file_load_image(Image *img, } } } + /* Make sure we don't have buggy values. */ + if(FileFormat == TypeDesc::FLOAT) { + /* For RGBA buffers we put all channels to 0 if either of them is not + * finite. This way we avoid possible artifacts caused by fully changed + * hue. + */ + if(is_rgba) { + for(size_t i = 0; i < num_pixels; i += 4) { + StorageType *pixel = &pixels[i*4]; + if(!isfinite(pixel[0]) || + !isfinite(pixel[1]) || + !isfinite(pixel[2]) || + !isfinite(pixel[3])) + { + pixel[0] = 0; + pixel[1] = 0; + pixel[2] = 0; + pixel[3] = 0; + } + } + } + else { + for(size_t i = 0; i < num_pixels; ++i) { + StorageType *pixel = &pixels[i]; + if(!isfinite(pixel[0])) { + pixel[0] = 0; + } + } + } + } + /* Scale image down if needed. 
*/ if(pixels_storage.size() > 0) { float scale_factor = 1.0f; while(max_size * scale_factor > texture_limit) { @@ -666,16 +716,12 @@ void ImageManager::device_load_image(Device *device, /* Slot assignment */ int flat_slot = type_index_to_flattened_slot(slot, type); - string name; - if(flat_slot >= 100) - name = string_printf("__tex_image_%s_%d", name_from_type(type).c_str(), flat_slot); - else if(flat_slot >= 10) - name = string_printf("__tex_image_%s_0%d", name_from_type(type).c_str(), flat_slot); - else - name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot); + string name = string_printf("__tex_image_%s_%03d", name_from_type(type).c_str(), flat_slot); if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; + if(dscene->tex_float4_image[slot] == NULL) + dscene->tex_float4_image[slot] = new device_vector<float4>(); + device_vector<float4>& tex_img = *dscene->tex_float4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -705,7 +751,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; + if(dscene->tex_float_image[slot] == NULL) + dscene->tex_float_image[slot] = new device_vector<float>(); + device_vector<float>& tex_img = *dscene->tex_float_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -732,7 +780,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; + if(dscene->tex_byte4_image[slot] == NULL) + dscene->tex_byte4_image[slot] = new device_vector<uchar4>(); + device_vector<uchar4>& tex_img = *dscene->tex_byte4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -762,7 +812,9 @@ void ImageManager::device_load_image(Device *device, } } else 
if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; + if(dscene->tex_byte_image[slot] == NULL) + dscene->tex_byte_image[slot] = new device_vector<uchar>(); + device_vector<uchar>& tex_img = *dscene->tex_byte_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -788,7 +840,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; + if(dscene->tex_half4_image[slot] == NULL) + dscene->tex_half4_image[slot] = new device_vector<half4>(); + device_vector<half4>& tex_img = *dscene->tex_half4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -817,7 +871,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& tex_img = dscene->tex_half_image[slot]; + if(dscene->tex_half_image[slot] == NULL) + dscene->tex_half_image[slot] = new device_vector<half>(); + device_vector<half>& tex_img = *dscene->tex_half_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -857,69 +913,100 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD ((OSL::TextureSystem*)osl_texture_system)->invalidate(filename); #endif } - else if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - 
if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + else { + device_memory *tex_img = NULL; + switch(type) { + case IMAGE_DATA_TYPE_FLOAT4: + if(slot >= dscene->tex_float4_image.size()) { + break; + } + tex_img = dscene->tex_float4_image[slot]; + dscene->tex_float4_image[slot] = NULL; + break; + case IMAGE_DATA_TYPE_BYTE4: + if(slot >= dscene->tex_byte4_image.size()) { + break; + } + tex_img = dscene->tex_byte4_image[slot]; + dscene->tex_byte4_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_HALF4: + if(slot >= dscene->tex_half4_image.size()) { + break; + } + tex_img = dscene->tex_half4_image[slot]; + dscene->tex_half4_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_FLOAT: + if(slot >= dscene->tex_float_image.size()) { + break; + } + tex_img = dscene->tex_float_image[slot]; + dscene->tex_float_image[slot] = NULL; + break; + case IMAGE_DATA_TYPE_BYTE: + if(slot >= dscene->tex_byte_image.size()) { + break; + } + tex_img = dscene->tex_byte_image[slot]; + dscene->tex_byte_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_HALF: + if(slot >= dscene->tex_half_image.size()) { + break; + } + tex_img = dscene->tex_half_image[slot]; + dscene->tex_half_image[slot]= NULL; + break; + default: + assert(0); + tex_img = NULL; } + if(tex_img) { + if(tex_img->device_pointer) { + thread_scoped_lock device_lock(device_mutex); + device->tex_free(*tex_img); + } - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& 
tex_img = dscene->tex_half_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + delete tex_img; } - - tex_img.clear(); } delete images[type][slot]; images[type][slot] = NULL; + --tex_num_images[type]; + } +} + +void ImageManager::device_prepare_update(DeviceScene *dscene) +{ + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + switch(type) { + case IMAGE_DATA_TYPE_FLOAT4: + if(dscene->tex_float4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT4]) + dscene->tex_float4_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT4]); + break; + case IMAGE_DATA_TYPE_BYTE4: + if(dscene->tex_byte4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE4]) + dscene->tex_byte4_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE4]); + break; + case IMAGE_DATA_TYPE_HALF4: + if(dscene->tex_half4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF4]) + dscene->tex_half4_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF4]); + break; + case IMAGE_DATA_TYPE_BYTE: + if(dscene->tex_byte_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE]) + dscene->tex_byte_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE]); + break; + case IMAGE_DATA_TYPE_FLOAT: + if(dscene->tex_float_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT]) + dscene->tex_float_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT]); + break; + case IMAGE_DATA_TYPE_HALF: + if(dscene->tex_half_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF]) + dscene->tex_half_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF]); + break; + } } } @@ -928,11 +1015,14 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress& progress) { - if(!need_update) + if(!need_update) { return; + } - TaskPool pool; + /* Make sure arrays are proper size. 
*/ + device_prepare_update(dscene); + TaskPool pool; for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { for(size_t slot = 0; slot < images[type].size(); slot++) { if(!images[type][slot]) @@ -992,159 +1082,101 @@ void ImageManager::device_update_slot(Device *device, uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) { uint8_t options = 0; - /* Image Options are packed into one uint: * bit 0 -> Interpolation - * bit 1 + 2 + 3-> Extension */ - if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) + * bit 1 + 2 + 3 -> Extension + */ + if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) { options |= (1 << 0); - - if(images[type][slot]->extension == EXTENSION_REPEAT) + } + if(images[type][slot]->extension == EXTENSION_REPEAT) { options |= (1 << 1); - else if(images[type][slot]->extension == EXTENSION_EXTEND) + } + else if(images[type][slot]->extension == EXTENSION_EXTEND) { options |= (1 << 2); - else /* EXTENSION_CLIP */ + } + else /* EXTENSION_CLIP */ { options |= (1 << 3); - + } return options; } -void ImageManager::device_pack_images(Device *device, - DeviceScene *dscene, - Progress& /*progess*/) +template<typename T> +void ImageManager::device_pack_images_type( + ImageDataType type, + const vector<device_vector<T>*>& cpu_textures, + device_vector<T> *device_image, + uint4 *info) { - /* For OpenCL, we pack all image textures into a single large texture, and - * do our own interpolation in the kernel. 
*/ size_t size = 0, offset = 0; - ImageDataType type; - - int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4] - + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE]; - uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); - - /* Byte4 Textures*/ - type = IMAGE_DATA_TYPE_BYTE4; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - size += tex_img.size(); - } - - uchar4 *pixels_byte4 = dscene->tex_image_byte4_packed.resize(size); - + /* First step is to calculate size of the texture we need. */ for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Float4 Textures*/ - type = IMAGE_DATA_TYPE_FLOAT4; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - size += tex_img.size(); - } - - float4 *pixels_float4 = dscene->tex_image_float4_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, 
tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Byte Textures*/ - type = IMAGE_DATA_TYPE_BYTE; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) + if(images[type][slot] == NULL) { continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; + } + device_vector<T>& tex_img = *cpu_textures[slot]; size += tex_img.size(); } - - uchar *pixels_byte = dscene->tex_image_byte_packed.resize(size); - + /* Now we know how much memory we need, so we can allocate and fill. */ + T *pixels = device_image->resize(size); for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) + if(images[type][slot] == NULL) { continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - + } + device_vector<T>& tex_img = *cpu_textures[slot]; uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); + const int index = type_index_to_flattened_slot(slot, type) * 2; + info[index] = make_uint4(tex_img.data_width, + tex_img.data_height, + offset, + options); info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); + memcpy(pixels + offset, + (void*)tex_img.data_pointer, + tex_img.memory_size()); offset += tex_img.size(); } +} - /* Float Textures*/ - type = IMAGE_DATA_TYPE_FLOAT; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - size += tex_img.size(); - } - - float *pixels_float = dscene->tex_image_float_packed.resize(size); - - for(size_t slot = 0; 
slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); +void ImageManager::device_pack_images(Device *device, + DeviceScene *dscene, + Progress& /*progess*/) +{ + /* For OpenCL, we pack all image textures into a single large texture, and + * do our own interpolation in the kernel. + */ - memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } + /* TODO(sergey): This will over-allocate a bit, but this is constant memory + * so should be fine for a short term. + */ + const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4), + max_flattened_slot(IMAGE_DATA_TYPE_BYTE4), + max_flattened_slot(IMAGE_DATA_TYPE_FLOAT), + max_flattened_slot(IMAGE_DATA_TYPE_BYTE)); + uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); + /* Pack byte4 textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_BYTE4, + dscene->tex_byte4_image, + &dscene->tex_image_byte4_packed, + info); + /* Pack float4 textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4, + dscene->tex_float4_image, + &dscene->tex_image_float4_packed, + info); + /* Pack byte textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_BYTE, + dscene->tex_byte_image, + &dscene->tex_image_byte_packed, + info); + /* Pack float textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_FLOAT, + dscene->tex_float_image, + &dscene->tex_image_float_packed, + info); + + /* Push textures to the device. 
*/ if(dscene->tex_image_byte4_packed.size()) { if(dscene->tex_image_byte4_packed.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -1201,16 +1233,23 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene) images[type].clear(); } - device->tex_free(dscene->tex_image_byte4_packed); + dscene->tex_float4_image.clear(); + dscene->tex_byte4_image.clear(); + dscene->tex_half4_image.clear(); + dscene->tex_float_image.clear(); + dscene->tex_byte_image.clear(); + dscene->tex_half_image.clear(); + device->tex_free(dscene->tex_image_float4_packed); - device->tex_free(dscene->tex_image_byte_packed); + device->tex_free(dscene->tex_image_byte4_packed); device->tex_free(dscene->tex_image_float_packed); + device->tex_free(dscene->tex_image_byte_packed); device->tex_free(dscene->tex_image_packed_info); - dscene->tex_image_byte4_packed.clear(); dscene->tex_image_float4_packed.clear(); - dscene->tex_image_byte_packed.clear(); + dscene->tex_image_byte4_packed.clear(); dscene->tex_image_float_packed.clear(); + dscene->tex_image_byte_packed.clear(); dscene->tex_image_packed_info.clear(); } diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 996b5a5b65f..db7e28a5e44 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -37,17 +37,6 @@ public: explicit ImageManager(const DeviceInfo& info); ~ImageManager(); - enum ImageDataType { - IMAGE_DATA_TYPE_FLOAT4 = 0, - IMAGE_DATA_TYPE_BYTE4 = 1, - IMAGE_DATA_TYPE_HALF4 = 2, - IMAGE_DATA_TYPE_FLOAT = 3, - IMAGE_DATA_TYPE_BYTE = 4, - IMAGE_DATA_TYPE_HALF = 5, - - IMAGE_DATA_NUM_TYPES - }; - int add_image(const string& filename, void *builtin_data, bool animated, @@ -68,8 +57,12 @@ public: InterpolationType interpolation, ExtensionType extension, bool use_alpha); - ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear); + ImageDataType get_image_metadata(const string& filename, + void *builtin_data, + bool& is_linear, + bool& 
builtin_free_cache); + void device_prepare_update(DeviceScene *dscene); void device_update(Device *device, DeviceScene *dscene, Scene *scene, @@ -98,19 +91,23 @@ public: int &width, int &height, int &depth, - int &channels)> builtin_image_info_cb; + int &channels, + bool &free_cache)> builtin_image_info_cb; function<bool(const string &filename, void *data, unsigned char *pixels, - const size_t pixels_size)> builtin_image_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_pixels_cb; function<bool(const string &filename, void *data, float *pixels, - const size_t pixels_size)> builtin_image_float_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_float_pixels_cb; struct Image { string filename; void *builtin_data; + bool builtin_free_cache; bool use_alpha; bool need_load; @@ -124,7 +121,9 @@ public: private: int tex_num_images[IMAGE_DATA_NUM_TYPES]; - int tex_start_images[IMAGE_DATA_NUM_TYPES]; + int max_num_images; + bool has_half_images; + bool cuda_fermi_limits; thread_mutex device_mutex; int animation_frame; @@ -133,7 +132,12 @@ private: void *osl_texture_system; bool pack_images; - bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components); + bool file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components); template<TypeDesc::BASETYPE FileFormat, typename StorageType, @@ -143,6 +147,7 @@ private: int texture_limit, device_vector<DeviceType>& tex_img); + int max_flattened_slot(ImageDataType type); int type_index_to_flattened_slot(int slot, ImageDataType type); int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); @@ -160,6 +165,13 @@ private: ImageDataType type, int slot); + template<typename T> + void device_pack_images_type( + ImageDataType type, + const vector<device_vector<T>*>& cpu_textures, + device_vector<T> *device_image, + uint4 *info); + void 
device_pack_images(Device *device, DeviceScene *dscene, Progress& progess); diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 4886dcd563f..93d88c5642c 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -224,6 +224,10 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; + /* Skip objects with NaNs */ + if (!object->bounds.valid()) { + return false; + } /* Skip if we are not visible for BSDFs. */ if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) { return false; @@ -486,18 +490,10 @@ static void background_cdf(int start, float2 *cond_cdf) { /* Conditional CDFs (rows, U direction). */ - /* NOTE: It is possible to have some NaN pixels on background - * which will ruin CDF causing wrong shading. We replace such - * pixels with black. - */ for(int i = start; i < end; i++) { float sin_theta = sinf(M_PI_F * (i + 0.5f) / res); float3 env_color = (*pixels)[i * res]; float ave_luminance = average(env_color); - /* TODO(sergey): Consider adding average_safe(). 
*/ - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count].y = 0.0f; @@ -505,9 +501,6 @@ static void background_cdf(int start, for(int j = 1; j < res; j++) { env_color = (*pixels)[i * res + j]; ave_luminance = average(env_color); - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res; diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index a4dc06c4345..03825f780e0 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) float3 vNi = vN[i]; if(do_transform) - vNi = normalize(transform_direction(&ntfm, vNi)); + vNi = safe_normalize(transform_direction(&ntfm, vNi)); vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f); } @@ -1944,6 +1944,7 @@ void MeshManager::device_update_displacement_images(Device *device, } } } + image_manager->device_prepare_update(dscene); foreach(int slot, bump_images) { pool.push(function_bind(&ImageManager::device_update_slot, image_manager, diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index cf28bb16bb7..4ca20cf7ef3 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -169,6 +169,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me if(!done[t.v[j]]) { done[t.v[j]] = true; float3 off = float4_to_float3(offset[k++]); + /* Avoid illegal vertex coordinates. 
*/ + off = ensure_finite3(off); mesh->verts[t.v[j]] += off; if(attr_mP != NULL) { for(int step = 0; step < mesh->motion_steps - 1; step++) { diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 1070e05a03b..90a68a06cb5 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -364,9 +364,10 @@ void ImageTextureNode::compile(OSLCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) + ImageDataType type; + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); + if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } else { @@ -553,9 +554,10 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) + ImageDataType type; + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); + if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } else { @@ -1791,12 +1793,19 @@ void ConvertNode::compile(OSLCompiler& compiler) assert(0); } +/* Base type for all closure-type nodes */ + +BsdfBaseNode::BsdfBaseNode(const NodeType *node_type) + : ShaderNode(node_type) +{ + special_type = SHADER_SPECIAL_TYPE_CLOSURE; +} + /* BSDF Closure */ BsdfNode::BsdfNode(const NodeType *node_type) -: ShaderNode(node_type) +: BsdfBaseNode(node_type) { - special_type = 
SHADER_SPECIAL_TYPE_CLOSURE; } void BsdfNode::compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3, ShaderInput *param4) @@ -2286,6 +2295,155 @@ void DiffuseBsdfNode::compile(OSLCompiler& compiler) compiler.add(this, "node_diffuse_bsdf"); } +/* Disney principled BSDF Closure */ +NODE_DEFINE(PrincipledBsdfNode) +{ + NodeType* type = NodeType::add("principled_bsdf", create, NodeType::SHADER); + + static NodeEnum distribution_enum; + distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID); + distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f); + SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f); + SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f)); + SOCKET_IN_FLOAT(specular, "Specular", 0.0f); + SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); + SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f); + SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f); + SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f); + SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f); + SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f); + SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f); + SOCKET_IN_FLOAT(ior, "IOR", 0.0f); + SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f); + SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f); + SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f); + SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(tangent, 
"Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT); + SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL); + + SOCKET_OUT_CLOSURE(BSDF, "BSDF"); + + return type; +} + +PrincipledBsdfNode::PrincipledBsdfNode() + : BsdfBaseNode(node_type) +{ + closure = CLOSURE_BSDF_PRINCIPLED_ID; + distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + distribution_orig = NBUILTIN_CLOSURES; +} + +bool PrincipledBsdfNode::has_surface_bssrdf() +{ + ShaderInput *subsurface_in = input("Subsurface"); + return (subsurface_in->link != NULL || subsurface > CLOSURE_WEIGHT_CUTOFF); +} + +void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) +{ + if(shader->has_surface) { + ShaderInput *tangent_in = input("Tangent"); + + if(!tangent_in->link) + attributes->add(ATTR_STD_GENERATED); + } + + ShaderNode::attributes(shader, attributes); +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius, + ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic, + ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness, + ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness) +{ + ShaderInput *base_color_in = input("Base Color"); + ShaderInput *subsurface_color_in = input("Subsurface Color"); + ShaderInput *normal_in = input("Normal"); + ShaderInput *clearcoat_normal_in = input("Clearcoat Normal"); + ShaderInput *tangent_in = input("Tangent"); + + float3 weight = make_float3(1.0f, 1.0f, 1.0f); + + compiler.add_node(NODE_CLOSURE_SET_WEIGHT, weight); + + int normal_offset = compiler.stack_assign_if_linked(normal_in); + int clearcoat_normal_offset = compiler.stack_assign_if_linked(clearcoat_normal_in); + int tangent_offset = 
compiler.stack_assign_if_linked(tangent_in); + int specular_offset = compiler.stack_assign(p_specular); + int roughness_offset = compiler.stack_assign(p_roughness); + int specular_tint_offset = compiler.stack_assign(p_specular_tint); + int anisotropic_offset = compiler.stack_assign(p_anisotropic); + int sheen_offset = compiler.stack_assign(p_sheen); + int sheen_tint_offset = compiler.stack_assign(p_sheen_tint); + int clearcoat_offset = compiler.stack_assign(p_clearcoat); + int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness); + int ior_offset = compiler.stack_assign(p_ior); + int transmission_offset = compiler.stack_assign(p_transmission); + int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness); + int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation); + int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius); + + compiler.add_node(NODE_CLOSURE_BSDF, + compiler.encode_uchar4(closure, + compiler.stack_assign(p_metallic), + compiler.stack_assign(p_subsurface), + compiler.closure_mix_weight_offset()), + __float_as_int((p_metallic) ? get_float(p_metallic->socket_type) : 0.0f), + __float_as_int((p_subsurface) ? get_float(p_subsurface->socket_type) : 0.0f)); + + compiler.add_node(normal_offset, tangent_offset, + compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset), + compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset)); + + compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset), + distribution, SVM_STACK_INVALID, SVM_STACK_INVALID); + + float3 bc_default = get_float3(base_color_in->socket_type); + + compiler.add_node(((base_color_in->link) ? 
compiler.stack_assign(base_color_in) : SVM_STACK_INVALID), + __float_as_int(bc_default.x), __float_as_int(bc_default.y), __float_as_int(bc_default.z)); + + compiler.add_node(clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID); + + float3 ss_default = get_float3(subsurface_color_in->socket_type); + + compiler.add_node(((subsurface_color_in->link) ? compiler.stack_assign(subsurface_color_in) : SVM_STACK_INVALID), + __float_as_int(ss_default.x), __float_as_int(ss_default.y), __float_as_int(ss_default.z)); +} + +bool PrincipledBsdfNode::has_integrator_dependency() +{ + ShaderInput *roughness_input = input("Roughness"); + return !roughness_input->link && roughness <= 1e-4f; +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler) +{ + compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"), + input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"), + input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"), + input("Anisotropic Rotation"), input("Transmission Roughness")); +} + +void PrincipledBsdfNode::compile(OSLCompiler& compiler) +{ + compiler.parameter(this, "distribution"); + compiler.add(this, "node_principled_bsdf"); +} + +bool PrincipledBsdfNode::has_bssrdf_bump() +{ + /* detect if anything is plugged into the normal input besides the default */ + ShaderInput *normal_in = input("Normal"); + return (normal_in->link && normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY); +} + /* Translucent BSDF Closure */ NODE_DEFINE(TranslucentBsdfNode) diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index a755b653a5b..c0271a3c8eb 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -252,6 +252,7 @@ public: class PointDensityTextureNode : public ShaderNode { public: SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode) + virtual int get_group() { 
return NODE_GROUP_LEVEL_3; } ~PointDensityTextureNode(); ShaderNode *clone() const; @@ -321,7 +322,14 @@ private: static bool initialized; }; -class BsdfNode : public ShaderNode { +class BsdfBaseNode : public ShaderNode { +public: + BsdfBaseNode(const NodeType *node_type); + + ClosureType closure; +}; + +class BsdfNode : public BsdfBaseNode { public: explicit BsdfNode(const NodeType *node_type); SHADER_NODE_BASE_CLASS(BsdfNode) @@ -333,7 +341,6 @@ public: float3 color; float3 normal; float surface_mix_weight; - ClosureType closure; virtual bool equals(const ShaderNode& /*other*/) { @@ -361,6 +368,39 @@ public: float roughness; }; +/* Disney principled BRDF */ +class PrincipledBsdfNode : public BsdfBaseNode { +public: + SHADER_NODE_CLASS(PrincipledBsdfNode) + + bool has_spatial_varying() { return true; } + bool has_surface_bssrdf(); + bool has_bssrdf_bump(); + void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, + ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic, + ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness, + ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness); + + float3 base_color; + float3 subsurface_color, subsurface_radius; + float metallic, subsurface, specular, roughness, specular_tint, anisotropic, + sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission, + anisotropic_rotation, transmission_roughness; + float3 normal, clearcoat_normal, tangent; + float surface_mix_weight; + ClosureType distribution, distribution_orig; + + virtual bool equals(const ShaderNode * /*other*/) + { + /* TODO(sergey): With some care BSDF nodes can be de-duplicated. 
*/ + return false; + } + + ClosureType get_closure_type() { return closure; } + bool has_integrator_dependency(); + void attributes(Shader *shader, AttributeRequestSet *attributes); +}; + class TranslucentBsdfNode : public BsdfNode { public: SHADER_NODE_CLASS(TranslucentBsdfNode) @@ -445,6 +485,7 @@ public: virtual ClosureType get_closure_type() { return CLOSURE_EMISSION_ID; } bool has_surface_emission() { return true; } + bool has_volume_support() { return true; } float3 color; float strength; @@ -496,6 +537,7 @@ public: return ShaderNode::get_feature() | NODE_FEATURE_VOLUME; } virtual ClosureType get_closure_type() { return closure; } + virtual bool has_volume_support() { return true; } float3 color; float density; diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 6bff29d1c76..a794f233718 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -156,6 +156,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s og->surface_state.clear(); og->volume_state.clear(); og->displacement_state.clear(); + og->bump_state.clear(); og->background_state.reset(); } diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 2b5267642a2..4c2c4f5fcc3 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -114,18 +114,18 @@ public: device_vector<uint> sobol_directions; /* cpu images */ - device_vector<uchar4> tex_byte4_image[TEX_NUM_BYTE4_CPU]; - device_vector<float4> tex_float4_image[TEX_NUM_FLOAT4_CPU]; - device_vector<float> tex_float_image[TEX_NUM_FLOAT_CPU]; - device_vector<uchar> tex_byte_image[TEX_NUM_BYTE_CPU]; - device_vector<half4> tex_half4_image[TEX_NUM_HALF4_CPU]; - device_vector<half> tex_half_image[TEX_NUM_HALF_CPU]; + vector<device_vector<float4>* > tex_float4_image; + vector<device_vector<uchar4>* > tex_byte4_image; + vector<device_vector<half4>* > tex_half4_image; + vector<device_vector<float>* > tex_float_image; + 
vector<device_vector<uchar>* > tex_byte_image; + vector<device_vector<half>* > tex_half_image; /* opencl images */ - device_vector<uchar4> tex_image_byte4_packed; device_vector<float4> tex_image_float4_packed; - device_vector<uchar> tex_image_byte_packed; + device_vector<uchar4> tex_image_byte4_packed; device_vector<float> tex_image_float_packed; + device_vector<uchar> tex_image_byte_packed; device_vector<uint4> tex_image_packed_info; KernelData data; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index c9b5547b407..8622318858e 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -114,8 +114,9 @@ Session::~Session() } /* clean up */ - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; + tile_manager.free_device(); delete buffers; delete display; @@ -268,8 +269,8 @@ void Session::run_gpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); device->task_wait(); @@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) thread_scoped_lock tile_lock(tile_mutex); /* get next tile from manager */ - Tile tile; + Tile *tile; int device_num = device->device_number(tile_device); if(!tile_manager.next_tile(tile, device_num)) return false; /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; + rtile.x = tile_manager.state.buffer.full_x + tile->x; + rtile.y = tile_manager.state.buffer.full_y + tile->y; + rtile.w = tile->w; + rtile.h = tile->h; rtile.start_sample = tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; rtile.resolution = tile_manager.state.resolution_divider; + rtile.tile_index = tile->index; + rtile.task = (tile->state == Tile::DENOISE)? 
RenderTile::DENOISE: RenderTile::PATH_TRACE; tile_lock.unlock(); @@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.buffer = buffers->buffer.device_pointer; rtile.rng_state = buffers->rng_state.device_pointer; rtile.buffers = buffers; + tile->buffers = buffers; device->map_tile(tile_device, rtile); return true; } - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - buffer_params.get_offset_stride(rtile.offset, rtile.stride); - - RenderBuffers *tilebuffers; + bool store_rtile = false; + if(tile->buffers == NULL) { + /* fill buffer parameters */ + BufferParams buffer_params = tile_manager.params; + buffer_params.full_x = rtile.x; + buffer_params.full_y = rtile.y; + buffer_params.width = rtile.w; + buffer_params.height = rtile.h; + + /* allocate buffers */ + if(params.progressive_refine) { + tile_lock.lock(); + + if(render_tiles.size() == 0) { + RenderTile nulltile; + nulltile.buffers = NULL; + render_tiles.resize(tile_manager.state.num_tiles, nulltile); + } - /* allocate buffers */ - if(params.progressive_refine) { - tile_lock.lock(); + /* In certain circumstances number of tiles in the tile manager could + * be changed. This is not supported by the progressive refine feature. 
+ */ + assert(render_tiles.size() == tile_manager.state.num_tiles); - if(tile_buffers.size() == 0) - tile_buffers.resize(tile_manager.state.num_tiles, NULL); + RenderTile &stored_rtile = render_tiles[tile->index]; + if(stored_rtile.buffers == NULL) { + tile->buffers = new RenderBuffers(tile_device); + tile->buffers->reset(tile_device, buffer_params); + store_rtile = true; + } + else { + assert(rtile.x == stored_rtile.x && + rtile.y == stored_rtile.y && + rtile.w == stored_rtile.w && + rtile.h == stored_rtile.h); + tile_lock.unlock(); + tile->buffers = stored_rtile.buffers; + } + } + else { + tile->buffers = new RenderBuffers(tile_device); - /* In certain circumstances number of tiles in the tile manager could - * be changed. This is not supported by the progressive refine feature. - */ - assert(tile_buffers.size() == tile_manager.state.num_tiles); + tile->buffers->reset(tile_device, buffer_params); + } + } - tilebuffers = tile_buffers[tile.index]; - if(tilebuffers == NULL) { - tilebuffers = new RenderBuffers(tile_device); - tile_buffers[tile.index] = tilebuffers; + tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - tilebuffers->reset(tile_device, buffer_params); - } + rtile.buffer = tile->buffers->buffer.device_pointer; + rtile.rng_state = tile->buffers->rng_state.device_pointer; + rtile.buffers = tile->buffers; + rtile.sample = 0; + if(store_rtile) { + render_tiles[tile->index] = rtile; tile_lock.unlock(); } - else { - tilebuffers = new RenderBuffers(tile_device); - - tilebuffers->reset(tile_device, buffer_params); - } - - rtile.buffer = tilebuffers->buffer.device_pointer; - rtile.rng_state = tilebuffers->rng_state.device_pointer; - rtile.buffers = tilebuffers; /* this will tag tile as IN PROGRESS in blender-side render pipeline, * which is needed to highlight currently rendering tile before first @@ -449,7 +468,7 @@ void Session::update_tile_sample(RenderTile& rtile) if(params.progressive_refine == false) { /* todo: optimize this by making it 
thread safe and removing lock */ - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } @@ -460,20 +479,77 @@ void Session::release_tile(RenderTile& rtile) { thread_scoped_lock tile_lock(tile_mutex); - progress.add_finished_tile(); + progress.add_finished_tile(rtile.task == RenderTile::DENOISE); - if(write_render_tile_cb) { - if(params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ - write_render_tile_cb(rtile); + bool delete_tile; - delete rtile.buffers; + if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) { + if(write_render_tile_cb && params.progressive_refine == false) { + write_render_tile_cb(rtile); + if(delete_tile) { + delete rtile.buffers; + tile_manager.state.tiles[rtile.tile_index].buffers = NULL; + } + } + } + else { + if(update_render_tile_cb && params.progressive_refine == false) { + update_render_tile_cb(rtile, false); } } update_status_time(); } +void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + + int center_idx = tiles[4].tile_index; + assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); + BufferParams buffer_params = tile_manager.params; + int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y, + buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height); + + for(int dy = -1, i = 0; dy <= 1; dy++) { + for(int dx = -1; dx <= 1; dx++, i++) { + int px = tiles[4].x + dx*params.tile_size.x; + int py = tiles[4].y + dy*params.tile_size.y; + if(px >= image_region.x && py >= image_region.y && + px < image_region.z && py < image_region.w) { + int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx; + Tile *tile = &tile_manager.state.tiles[tile_index]; + assert(tile->buffers); + + tiles[i].buffer = tile->buffers->buffer.device_pointer; + tiles[i].x = tile_manager.state.buffer.full_x + tile->x; + tiles[i].y = 
tile_manager.state.buffer.full_y + tile->y; + tiles[i].w = tile->w; + tiles[i].h = tile->h; + tiles[i].buffers = tile->buffers; + + tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride); + } + else { + tiles[i].buffer = (device_ptr)NULL; + tiles[i].buffers = NULL; + tiles[i].x = clamp(px, image_region.x, image_region.z); + tiles[i].y = clamp(py, image_region.y, image_region.w); + tiles[i].w = tiles[i].h = 0; + } + } + } + + assert(tiles[4].buffers); + device->map_neighbor_tiles(tile_device, tiles); +} + +void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + device->unmap_neighbor_tiles(tile_device, tiles); +} + void Session::run_cpu() { bool tiles_written = false; @@ -558,8 +634,8 @@ void Session::run_cpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); /* update status and timing */ update_status_time(); @@ -646,20 +722,25 @@ DeviceRequestedFeatures Session::get_requested_device_features() requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); requested_features.use_transparent &= scene->integrator->transparent_shadows; + requested_features.use_denoising = params.use_denoising; return requested_features; } -void Session::load_kernels() +void Session::load_kernels(bool lock_scene) { - thread_scoped_lock scene_lock(scene->mutex); + thread_scoped_lock scene_lock; + if(lock_scene) { + scene_lock = thread_scoped_lock(scene->mutex); + } + + DeviceRequestedFeatures requested_features = get_requested_device_features(); - if(!kernels_loaded) { + if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); scoped_timer timer; - DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) 
<< "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { string message = device->error_message(); @@ -676,6 +757,7 @@ void Session::load_kernels() VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); kernels_loaded = true; + loaded_kernel_features = requested_features; } } @@ -744,10 +826,10 @@ void Session::reset(BufferParams& buffer_params, int samples) if(params.progressive_refine) { thread_scoped_lock buffers_lock(buffers_mutex); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; - tile_buffers.clear(); + render_tiles.clear(); } } @@ -826,6 +908,8 @@ void Session::update_scene() /* update scene */ if(scene->need_update()) { + load_kernels(false); + progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, progress); } @@ -836,7 +920,7 @@ void Session::update_status_time(bool show_pause, bool show_done) int progressive_sample = tile_manager.state.sample; int num_samples = tile_manager.get_num_effective_samples(); - int tile = progress.get_finished_tiles(); + int tile = progress.get_rendered_tiles(); int num_tiles = tile_manager.state.num_tiles; /* update status */ @@ -844,11 +928,12 @@ void Session::update_status_time(bool show_pause, bool show_done) if(!params.progressive) { const bool is_cpu = params.device.type == DEVICE_CPU; + const bool rendering_finished = (tile == num_tiles); const bool is_last_tile = (tile + 1) == num_tiles; substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); - if(device->show_samples() || (is_cpu && is_last_tile)) { + if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { /* Some devices automatically support showing the sample number: * - CUDADevice * - OpenCLDevice when using the megakernel (the split kernel renders multiple @@ -860,6 +945,9 @@ void Session::update_status_time(bool show_pause, bool 
show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } + if(params.use_denoising) { + substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); + } } else if(tile_manager.num_samples == INT_MAX) substatus = string_printf("Path Tracing Sample %d", progressive_sample+1); @@ -873,6 +961,7 @@ void Session::update_status_time(bool show_pause, bool show_done) } else if(show_done) { status = "Done"; + progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */ } else { status = substatus; @@ -882,13 +971,15 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -void Session::path_trace() +void Session::render() { /* add path trace task */ - DeviceTask task(DeviceTask::PATH_TRACE); + DeviceTask task(DeviceTask::RENDER); task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2); task.release_tile = function_bind(&Session::release_tile, this, _1); + task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); + task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); @@ -897,6 +988,18 @@ void Session::path_trace() task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); + if(params.use_denoising) { + task.denoising_radius = params.denoising_radius; + task.denoising_strength = params.denoising_strength; + task.denoising_feature_strength = params.denoising_feature_strength; + task.denoising_relative_pca = params.denoising_relative_pca; + + assert(!scene->film->need_update); + task.pass_stride = scene->film->pass_stride; + task.pass_denoising_data = 
scene->film->denoising_data_offset; + task.pass_denoising_clean = scene->film->denoising_clean_offset; + } + device->task_add(task); } @@ -940,9 +1043,7 @@ bool Session::update_progressive_refine(bool cancel) } if(params.progressive_refine) { - foreach(RenderBuffers *buffers, tile_buffers) { - RenderTile rtile; - rtile.buffers = buffers; + foreach(RenderTile &rtile, render_tiles) { rtile.sample = sample; if(write) { @@ -951,7 +1052,7 @@ bool Session::update_progressive_refine(bool cancel) } else { if(update_render_tile_cb) - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } } @@ -965,10 +1066,11 @@ void Session::device_free() { scene->device_free(); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &tile, render_tiles) + delete tile.buffers; + tile_manager.free_device(); - tile_buffers.clear(); + render_tiles.clear(); /* used from background render only, so no need to * re-create render/display buffers here diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index a7e5f78a64d..9f8bb8c42fa 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -57,6 +57,12 @@ public: bool display_buffer_linear; + bool use_denoising; + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + double cancel_timeout; double reset_timeout; double text_timeout; @@ -77,6 +83,12 @@ public: start_resolution = INT_MAX; threads = 0; + use_denoising = false; + denoising_radius = 8; + denoising_strength = 0.0f; + denoising_feature_strength = 0.0f; + denoising_relative_pca = false; + display_buffer_linear = false; cancel_timeout = 0.1; @@ -126,7 +138,7 @@ public: Stats stats; function<void(RenderTile&)> write_render_tile_cb; - function<void(RenderTile&)> update_render_tile_cb; + function<void(RenderTile&, bool)> update_render_tile_cb; explicit Session(const SessionParams& params); ~Session(); @@ -141,7 +153,7 @@ public: void 
set_pause(bool pause); void update_scene(); - void load_kernels(); + void load_kernels(bool lock_scene=true); void device_free(); @@ -162,7 +174,7 @@ protected: void update_status_time(bool show_pause = false, bool show_done = false); void tonemap(int sample); - void path_trace(); + void render(); void reset_(BufferParams& params, int samples); void run_cpu(); @@ -177,6 +189,9 @@ protected: void update_tile_sample(RenderTile& tile); void release_tile(RenderTile& tile); + void map_neighbor_tiles(RenderTile *tiles, Device *tile_device); + void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device); + bool device_use_gl; thread *session_thread; @@ -195,6 +210,7 @@ protected: thread_mutex display_mutex; bool kernels_loaded; + DeviceRequestedFeatures loaded_kernel_features; double reset_time; @@ -202,7 +218,7 @@ protected: double last_update_time; bool update_progressive_refine(bool cancel); - vector<RenderBuffers *> tile_buffers; + vector<RenderTile> render_tiles; DeviceRequestedFeatures get_requested_device_features(); diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 23eee1916bd..44a266dfe18 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -49,6 +49,16 @@ static float beckmann_table_slope_max() return 6.0; } + +/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86 + * see T50176 for details + */ +#if defined(_MSC_VER) && (_MSC_VER == 1900) +# define MSVC_VOLATILE volatile +#else +# define MSVC_VOLATILE +#endif + /* Paper used: Importance Sampling Microfacet-Based BSDFs with the * Distribution of Visible Normals. Supplemental Material 2/2. 
* @@ -72,7 +82,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) slope_x[0] = (double)-beckmann_table_slope_max(); CDF_P22_omega_i[0] = 0; - for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { + for(MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { /* slope_x */ slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f)); @@ -116,6 +126,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) } } +#undef MSVC_VOLATILE + static void beckmann_table_build(vector<float>& table) { table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE); @@ -178,6 +190,7 @@ Shader::Shader() has_volume_spatial_varying = false; has_object_dependency = false; has_integrator_dependency = false; + has_volume_connected = false; displacement_method = DISPLACE_BUMP; @@ -229,6 +242,10 @@ void Shader::set_graph(ShaderGraph *graph_) delete graph_bump; graph = graph_; graph_bump = NULL; + + /* Store info here before graph optimization to make sure that + * nodes that get optimized away still count. */ + has_volume_connected = (graph->output()->input("Volume")->link != NULL); } void Shader::tag_update(Scene *scene) @@ -319,11 +336,14 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) (void)shadingsystem; /* Ignored when built without OSL. */ #ifdef WITH_OSL - if(shadingsystem == SHADINGSYSTEM_OSL) + if(shadingsystem == SHADINGSYSTEM_OSL) { manager = new OSLShaderManager(); + } else #endif + { manager = new SVMShaderManager(); + } add_default(scene); @@ -420,15 +440,14 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_HAS_VOLUME; has_volumes = true; - /* in this case we can assume transparent surface */ - if(!shader->has_surface) - flag |= SD_HAS_ONLY_VOLUME; - /* todo: this could check more fine grained, to skip useless volumes * enclosed inside an opaque bsdf. 
*/ flag |= SD_HAS_TRANSPARENT_SHADOW; } + /* in this case we can assume transparent surface */ + if(shader->has_volume_connected && !shader->has_surface) + flag |= SD_HAS_ONLY_VOLUME; if(shader->heterogeneous_volume && shader->has_volume_spatial_varying) flag |= SD_HETEROGENEOUS_VOLUME; if(shader->has_bssrdf_bump) @@ -569,6 +588,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph, if(CLOSURE_IS_VOLUME(bsdf_node->closure)) { requested_features->nodes_features |= NODE_FEATURE_VOLUME; } + else if(CLOSURE_IS_PRINCIPLED(bsdf_node->closure)) { + requested_features->use_principled = true; + } } if(node->has_surface_bssrdf()) { requested_features->use_subsurface = true; diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index a8018231f1a..b6714b13247 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -105,6 +105,15 @@ public: bool need_update; bool need_update_attributes; + /* If the shader has only volume components, the surface is assumed to + * be transparent. + * However, graph optimization might remove the volume subgraph, but + * since the user connected something to the volume output the surface + * should still be transparent. + * Therefore, has_volume_connected stores whether some volume subtree + * was connected before optimization. 
*/ + bool has_volume_connected; + /* information about shader after compiling */ bool has_surface; bool has_surface_emission; diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 944e746ca2d..176a1f4f0f3 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -25,37 +25,39 @@ namespace { class TileComparator { public: - TileComparator(TileOrder order, int2 center) - : order_(order), - center_(center) + TileComparator(TileOrder order_, int2 center_, Tile *tiles_) + : order(order_), + center(center_), + tiles(tiles_) {} - bool operator()(Tile &a, Tile &b) + bool operator()(int a, int b) { - switch(order_) { + switch(order) { case TILE_CENTER: { - float2 dist_a = make_float2(center_.x - (a.x + a.w/2), - center_.y - (a.y + a.h/2)); - float2 dist_b = make_float2(center_.x - (b.x + b.w/2), - center_.y - (b.y + b.h/2)); + float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2), + center.y - (tiles[a].y + tiles[a].h/2)); + float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2), + center.y - (tiles[b].y + tiles[b].h/2)); return dot(dist_a, dist_a) < dot(dist_b, dist_b); } case TILE_LEFT_TO_RIGHT: - return (a.x == b.x)? (a.y < b.y): (a.x < b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x); case TILE_RIGHT_TO_LEFT: - return (a.x == b.x)? (a.y < b.y): (a.x > b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x); case TILE_TOP_TO_BOTTOM: - return (a.y == b.y)? (a.x < b.x): (a.y > b.y); + return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y); case TILE_BOTTOM_TO_TOP: default: - return (a.y == b.y)? (a.x < b.x): (a.y < b.y); + return (tiles[a].y == tiles[b].y)? 
(tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y); } } protected: - TileOrder order_; - int2 center_; + TileOrder order; + int2 center; + Tile *tiles; }; inline int2 hilbert_index_to_pos(int n, int d) @@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; background = background_; + schedule_denoising = false; range_start_sample = 0; range_num_samples = -1; @@ -108,6 +111,16 @@ TileManager::~TileManager() { } +void TileManager::free_device() +{ + if(schedule_denoising) { + for(int i = 0; i < state.tiles.size(); i++) { + delete state.tiles[i].buffers; + state.tiles[i].buffers = NULL; + } + } +} + static int get_divider(int w, int h, int start_resolution) { int divider = 1; @@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.num_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); + state.render_tiles.clear(); + state.denoising_tiles.clear(); state.tiles.clear(); } @@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_) } state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height; + if(schedule_denoising) { + state.total_pixel_samples += params.width*params.height; + } } } @@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced) int image_h = max(1, params.height/resolution); int2 center = make_int2(image_w/2, image_h/2); - state.tiles.clear(); - int num_logical_devices = preserve_tile_device? num_devices: 1; int num = min(image_h, num_logical_devices); int slice_num = sliced? num: 1; - int tile_index = 0; + int tile_w = (tile_size.x >= image_w) ? 
1 : divide_up(image_w, tile_size.x); state.tiles.clear(); - state.tiles.resize(num); - vector<list<Tile> >::iterator tile_list = state.tiles.begin(); + state.render_tiles.clear(); + state.denoising_tiles.clear(); + state.render_tiles.resize(num); + state.denoising_tiles.resize(num); + state.tile_stride = tile_w; + vector<list<int> >::iterator tile_list; + tile_list = state.render_tiles.begin(); if(tile_order == TILE_HILBERT_SPIRAL) { assert(!sliced); + int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); + state.tiles.resize(tile_w*tile_h); + /* Size of blocks in tiles, must be a power of 2 */ const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y; - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x; - int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y; + int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x); + int blocks_y = (block_size.y >= image_h)? 
1: divide_up(image_h, block_size.y); int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ /* Offset of spiral (to keep it centered) */ int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2); @@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced) if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { int w = min(tile_size.x, image_w - pos.x); int h = min(tile_size.y, image_h - pos.y); - tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device)); + int2 ipos = pos / tile_size; + int idx = ipos.y*tile_w + ipos.x; + state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); + tile_list->push_front(idx); cur_tiles++; - tile_index++; if(cur_tiles == tiles_per_device) { tile_list++; @@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced) break; } } - return tile_index; + return tile_w*tile_h; } + int idx = 0; for(int slice = 0; slice < slice_num; slice++) { int slice_y = (image_h/slice_num)*slice; int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y; + int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y); - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; for(int tile_y = 0; tile_y < tile_h; tile_y++) { - for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) { + for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { int x = tile_x * tile_size.x; int y = tile_y * tile_size.y; int w = (tile_x == tile_w-1)? image_w - x: tile_size.x; int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y; - tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? 
slice: cur_device)); + state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER)); + tile_list->push_back(idx); if(!sliced) { cur_tiles++; @@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced) if(cur_tiles == tiles_per_device) { /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */ if(tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center)); + tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); } tile_list++; cur_tiles = 0; @@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced) } } - return tile_index; + return idx; } void TileManager::set_tiles() @@ -333,15 +358,111 @@ void TileManager::set_tiles() state.buffer.full_height = max(1, params.full_height/resolution); } -bool TileManager::next_tile(Tile& tile, int device) +int TileManager::get_neighbor_index(int index, int neighbor) +{ + static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0}; + + int resolution = state.resolution_divider; + int image_w = max(1, params.width/resolution); + int image_h = max(1, params.height/resolution); + int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x); + int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y); + + int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor]; + if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h) + return -1; + + return ny*state.tile_stride + nx; +} + +/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */ +bool TileManager::check_neighbor_state(int index, Tile::State min_state) +{ + if(index < 0 || state.tiles[index].state < min_state) { + return false; + } + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + /* Out-of-bounds tiles don't matter. 
*/ + if(nindex >= 0 && state.tiles[nindex].state < min_state) { + return false; + } + } + + return true; +} + +/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */ +bool TileManager::finish_tile(int index, bool &delete_tile) +{ + delete_tile = false; + + switch(state.tiles[index].state) { + case Tile::RENDER: + { + if(!schedule_denoising) { + state.tiles[index].state = Tile::DONE; + delete_tile = true; + return true; + } + state.tiles[index].state = Tile::RENDERED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::RENDERED)) { + state.tiles[nindex].state = Tile::DENOISE; + state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); + } + } + return false; + } + case Tile::DENOISE: + { + state.tiles[index].state = Tile::DENOISED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::DENOISED)) { + state.tiles[nindex].state = Tile::DONE; + /* It can happen that the tile just finished denoising and already can be freed here. + * However, in that case it still has to be written before deleting, so we can't delete it yet. */ + if(neighbor == 8) { + delete_tile = true; + } + else { + delete state.tiles[nindex].buffers; + state.tiles[nindex].buffers = NULL; + } + } + } + return true; + } + default: + assert(false); + return true; + } +} + +bool TileManager::next_tile(Tile* &tile, int device) { int logical_device = preserve_tile_device? 
device: 0; - if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty()) + if(logical_device >= state.render_tiles.size()) + return false; + + if(!state.denoising_tiles[logical_device].empty()) { + int idx = state.denoising_tiles[logical_device].front(); + state.denoising_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; + return true; + } + + if(state.render_tiles[logical_device].empty()) return false; - tile = Tile(state.tiles[logical_device].front()); - state.tiles[logical_device].pop_front(); + int idx = state.render_tiles[logical_device].front(); + state.render_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; return true; } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 622b89f7670..e39a8f0627a 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -31,12 +31,20 @@ public: int index; int x, y, w, h; int device; + /* RENDER: The tile has to be rendered. + * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). + * DENOISE: The tile can be denoised now. + * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). + * DONE: The tile is finished and has been freed. 
*/ + typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; + State state; + RenderBuffers *buffers; Tile() {} - Tile(int index_, int x_, int y_, int w_, int h_, int device_) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {} + Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) + : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {} }; /* Tile order */ @@ -58,6 +66,8 @@ public: BufferParams params; struct State { + vector<Tile> tiles; + int tile_stride; BufferParams buffer; int sample; int num_samples; @@ -67,9 +77,12 @@ public: /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ uint64_t total_pixel_samples; - /* This vector contains a list of tiles for every logical device in the session. - * In each list, the tiles are sorted according to the tile order setting. */ - vector<list<Tile> > tiles; + + /* These lists contain the indices of the tiles to be rendered/denoised and are used + * when acquiring a new tile for the device. + * Each list in each vector is for one logical device. */ + vector<list<int> > render_tiles; + vector<list<int> > denoising_tiles; } state; int num_samples; @@ -78,10 +91,12 @@ public: bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); ~TileManager(); + void free_device(); void reset(BufferParams& params, int num_samples); void set_samples(int num_samples); bool next(); - bool next_tile(Tile& tile, int device = 0); + bool next_tile(Tile* &tile, int device = 0); + bool finish_tile(int index, bool& delete_tile); bool done(); void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; } @@ -96,6 +111,9 @@ public: /* Get number of actual samples to render. */ int get_num_effective_samples(); + + /* Schedule tiles for denoising after they've been rendered. 
*/ + bool schedule_denoising; protected: void set_tiles(); @@ -127,6 +145,9 @@ protected: /* Generate tile list, return number of tiles. */ int gen_tiles(bool sliced); + + int get_neighbor_index(int index, int neighbor); + bool check_neighbor_state(int index, Tile::State state); }; CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp index 22ec8e0ee8e..6c059ba5d12 100644 --- a/intern/cycles/test/util_string_test.cpp +++ b/intern/cycles/test/util_string_test.cpp @@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both) EXPECT_EQ(str, "foo bar zzz"); } +TEST(util_string_remove_trademark, both_space) +{ + string str = string_remove_trademark("foo bar(TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, both_space_around) +{ + string str = string_remove_trademark("foo bar (TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, trademark_space_suffix) +{ + string str = string_remove_trademark("foo bar (TM)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, trademark_space_middle) +{ + string str = string_remove_trademark("foo bar (TM) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + + +TEST(util_string_remove_trademark, r_space_suffix) +{ + string str = string_remove_trademark("foo bar (R)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, r_space_middle) +{ + string str = string_remove_trademark("foo bar (R) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index a015fef8284..43f9a57d099 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -53,6 +53,13 @@ set(SRC_HEADERS util_math_cdf.h util_math_fast.h util_math_intersect.h + util_math_float2.h + util_math_float3.h + util_math_float4.h + util_math_int2.h + util_math_int3.h + util_math_int4.h + util_math_matrix.h 
util_md5.h util_opengl.h util_optimization.h @@ -80,6 +87,32 @@ set(SRC_HEADERS util_time.h util_transform.h util_types.h + util_types_float2.h + util_types_float2_impl.h + util_types_float3.h + util_types_float3_impl.h + util_types_float4.h + util_types_float4_impl.h + util_types_int2.h + util_types_int2_impl.h + util_types_int3.h + util_types_int3_impl.h + util_types_int4.h + util_types_int4_impl.h + util_types_uchar2.h + util_types_uchar2_impl.h + util_types_uchar3.h + util_types_uchar3_impl.h + util_types_uchar4.h + util_types_uchar4_impl.h + util_types_uint2.h + util_types_uint2_impl.h + util_types_uint3.h + util_types_uint3_impl.h + util_types_uint4.h + util_types_uint4_impl.h + util_types_vector3.h + util_types_vector3_impl.h util_vector.h util_version.h util_view.h diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 6c52117ef9a..643af87a65f 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -35,6 +35,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1) #define CCL_LOCAL_MEM_FENCE 0 #define ccl_barrier(flags) (void)0 @@ -68,6 +69,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) +#define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) #define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE #define ccl_barrier(flags) barrier(flags) @@ -79,7 +81,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) #define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), 
(unsigned int)(x)) +#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) #define CCL_LOCAL_MEM_FENCE #define ccl_barrier(flags) __syncthreads() diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index 4d673dc34d8..c73beab98dc 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z) 0.055648f * x + -0.204043f * y + 1.057311f * z); } -#ifndef __KERNEL_OPENCL__ - -ccl_device float3 color_srgb_to_scene_linear(float3 c) -{ - return make_float3( - color_srgb_to_scene_linear(c.x), - color_srgb_to_scene_linear(c.y), - color_srgb_to_scene_linear(c.z)); -} - #ifdef __KERNEL_SSE2__ /* * Calculate initial guess for arg^exp based on float representation @@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c) ssef gte = fastpow24(gtebase); return select(cmp, lt, gte); } -#endif +#endif /* __KERNEL_SSE2__ */ -ccl_device float3 color_scene_linear_to_srgb(float3 c) +ccl_device float3 color_srgb_to_scene_linear_v3(float3 c) { - return make_float3( - color_scene_linear_to_srgb(c.x), - color_scene_linear_to_srgb(c.y), - color_scene_linear_to_srgb(c.z)); + return make_float3(color_srgb_to_scene_linear(c.x), + color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z)); } +ccl_device float3 color_scene_linear_to_srgb_v3(float3 c) +{ + return make_float3(color_scene_linear_to_srgb(c.x), + color_scene_linear_to_srgb(c.y), + color_scene_linear_to_srgb(c.z)); +} + +ccl_device float4 color_srgb_to_scene_linear_v4(float4 c) +{ +#ifdef __KERNEL_SSE2__ + ssef r_ssef; + float4 &r = (float4 &)r_ssef; + r = c; + r_ssef = color_srgb_to_scene_linear(r_ssef); + r.w = c.w; + return r; +#else + return make_float4(color_srgb_to_scene_linear(c.x), + 
color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z), + c.w); #endif +} ccl_device float linear_rgb_to_gray(float3 c) { diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 9cfa57dd741..10895f2e918 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -118,7 +118,7 @@ void DebugFlags::OpenCL::reset() } /* Initialize other flags from environment variables. */ debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); - single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL); + single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL); } DebugFlags::DebugFlags() @@ -184,8 +184,8 @@ std::ostream& operator <<(std::ostream &os, << " Device type : " << opencl_device_type << "\n" << " Kernel type : " << opencl_kernel_type << "\n" << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" - << " Signle program : " << string_from_bool(debug_flags.opencl.single_program) - << "\n"; + << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n" + << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 4505d584490..450cd900a9f 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -115,6 +115,10 @@ public: /* Use single program */ bool single_program; + + /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */ + /* Artificial memory limit in bytes (0 if disabled). */ + size_t mem_limit; }; /* Get instance of debug flags registry. 
*/ diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h index 5f9dcfb2481..1abcabd5294 100644 --- a/intern/cycles/util/util_guarded_allocator.h +++ b/intern/cycles/util/util_guarded_allocator.h @@ -50,9 +50,9 @@ public: T *allocate(size_t n, const void *hint = 0) { + (void)hint; size_t size = n * sizeof(T); util_guarded_mem_alloc(size); - (void)hint; if(n == 0) { return NULL; } diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index a5a3bd34fff..f38683bf7de 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -30,10 +30,10 @@ void util_logging_init(const char *argv0) #ifdef WITH_CYCLES_LOGGING using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption; - /* Make it so FATAL messages are always print into console. */ + /* Make it so ERROR messages are always print into console. */ char severity_fatal[32]; snprintf(severity_fatal, sizeof(severity_fatal), "%d", - google::GLOG_FATAL); + google::GLOG_ERROR); google::InitGoogleLogging(argv0); SetCommandLineOption("logtostderr", "1"); diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index ecf9c9cfee0..492f830e67c 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -19,28 +19,30 @@ #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__) # include <glog/logging.h> -#else -# include <iostream> #endif +#include <iostream> + CCL_NAMESPACE_BEGIN #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) -class StubStream : public std::ostream { - public: - StubStream() : std::ostream(NULL) { } +class StubStream { +public: + template<class T> + StubStream& operator<<(const T&) { + return *this; + } }; class LogMessageVoidify { public: LogMessageVoidify() { } - void operator&(::std::ostream&) { } + void operator&(StubStream&) { } }; # define LOG_SUPPRESS() (true) ? 
(void) 0 : LogMessageVoidify() & StubStream() # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() - #endif #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index e0305b978b9..b719640b19c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -28,12 +28,10 @@ #ifndef __KERNEL_OPENCL__ - -#include <float.h> -#include <math.h> -#include <stdio.h> - -#endif +# include <float.h> +# include <math.h> +# include <stdio.h> +#endif /* __KERNEL_OPENCL__ */ #include "util/util_types.h" @@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN /* Division */ #ifndef M_PI_F -#define M_PI_F (3.1415926535897932f) /* pi */ +# define M_PI_F (3.1415926535897932f) /* pi */ #endif #ifndef M_PI_2_F -#define M_PI_2_F (1.5707963267948966f) /* pi/2 */ +# define M_PI_2_F (1.5707963267948966f) /* pi/2 */ #endif #ifndef M_PI_4_F -#define M_PI_4_F (0.7853981633974830f) /* pi/4 */ +# define M_PI_4_F (0.7853981633974830f) /* pi/4 */ #endif #ifndef M_1_PI_F -#define M_1_PI_F (0.3183098861837067f) /* 1/pi */ +# define M_1_PI_F (0.3183098861837067f) /* 1/pi */ #endif #ifndef M_2_PI_F -#define M_2_PI_F (0.6366197723675813f) /* 2/pi */ +# define M_2_PI_F (0.6366197723675813f) /* 2/pi */ #endif /* Multiplication */ #ifndef M_2PI_F -#define M_2PI_F (6.2831853071795864f) /* 2*pi */ +# define M_2PI_F (6.2831853071795864f) /* 2*pi */ #endif #ifndef M_4PI_F -#define M_4PI_F (12.566370614359172f) /* 4*pi */ +# define M_4PI_F (12.566370614359172f) /* 4*pi */ #endif /* Float sqrt variations */ - #ifndef M_SQRT2_F -#define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ +# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ #endif - #ifndef M_LN2_F -#define M_LN2_F (0.6931471805599453f) /* ln(2) */ +# define M_LN2_F (0.6931471805599453f) /* ln(2) */ #endif - #ifndef M_LN10_F -#define M_LN10_F (2.3025850929940457f) /* ln(10) */ +# define M_LN10_F (2.3025850929940457f) /* ln(10) */ 
#endif /* Scalar */ #ifdef _WIN32 - -#ifndef __KERNEL_OPENCL__ - +# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b)? a: b; @@ -95,13 +88,10 @@ ccl_device_inline float fminf(float a, float b) { return (a < b)? a: b; } - -#endif - -#endif +# endif /* !__KERNEL_OPENCL__ */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ - using std::isfinite; using std::isnan; @@ -157,8 +147,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - -#endif +#endif /* __KERNEL_GPU__ */ ccl_device_inline float min4(float a, float b, float c, float d) { @@ -170,525 +159,141 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -ccl_device_inline float max3(float3 a) -{ - return max(max(a.x, a.y), a.z); -} - #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ -ccl_device_inline int clamp(int a, int mn, int mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float clamp(float a, float mn, float mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float mix(float a, float b, float t) -{ - return a + t*(b - a); -} - -#endif - -#ifndef __KERNEL_CUDA__ - -ccl_device_inline float saturate(float a) -{ - return clamp(a, 0.0f, 1.0f); -} - -#endif - -ccl_device_inline int float_to_int(float f) -{ - return (int)f; -} - -ccl_device_inline int floor_to_int(float f) -{ - return float_to_int(floorf(f)); -} - -ccl_device_inline int ceil_to_int(float f) -{ - return float_to_int(ceilf(f)); -} - -ccl_device_inline float signf(float f) -{ - return (f < 0.0f)? 
-1.0f: 1.0f; -} - -ccl_device_inline float nonzerof(float f, float eps) -{ - if(fabsf(f) < eps) - return signf(f)*eps; - else - return f; -} - -ccl_device_inline float smoothstepf(float f) -{ - float ff = f*f; - return (3.0f*ff - 2.0f*ff*f); -} - -ccl_device_inline int mod(int x, int m) -{ - return (x % m + m) % m; -} - -/* Float2 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool is_zero(const float2& a) -{ - return (a.x == 0.0f && a.y == 0.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float average(const float2& a) -{ - return (a.x + a.y)*(1.0f/2.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 operator-(const float2& a) -{ - return make_float2(-a.x, -a.y); -} - -ccl_device_inline float2 operator*(const float2& a, const float2& b) -{ - return make_float2(a.x*b.x, a.y*b.y); -} - -ccl_device_inline float2 operator*(const float2& a, float f) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator*(float f, const float2& a) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator/(float f, const float2& a) -{ - return make_float2(f/a.x, f/a.y); -} - -ccl_device_inline float2 operator/(const float2& a, float f) -{ - float invf = 1.0f/f; - return make_float2(a.x*invf, a.y*invf); -} - -ccl_device_inline float2 operator/(const float2& a, const float2& b) +ccl_device_inline int as_int(uint i) { - return make_float2(a.x/b.x, a.y/b.y); + union { uint ui; int i; } u; + u.ui = i; + return u.i; } -ccl_device_inline float2 operator+(const float2& a, const float2& b) +ccl_device_inline uint as_uint(int i) { - return make_float2(a.x+b.x, a.y+b.y); + union { uint ui; int i; } u; + u.i = i; + return u.ui; } -ccl_device_inline float2 operator-(const float2& a, const float2& b) +ccl_device_inline uint as_uint(float f) { - return make_float2(a.x-b.x, a.y-b.y); + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator+=(float2& a, const float2& 
b) +ccl_device_inline int __float_as_int(float f) { - return a = a + b; + union { int i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator*=(float2& a, const float2& b) +ccl_device_inline float __int_as_float(int i) { - return a = a * b; + union { int i; float f; } u; + u.i = i; + return u.f; } -ccl_device_inline float2 operator*=(float2& a, float f) +ccl_device_inline uint __float_as_uint(float f) { - return a = a * f; + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator/=(float2& a, const float2& b) +ccl_device_inline float __uint_as_float(uint i) { - return a = a / b; + union { uint i; float f; } u; + u.i = i; + return u.f; } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float2 operator/=(float2& a, float f) +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) { - float invf = 1.0f/f; - return a = a * invf; + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; } - -ccl_device_inline float dot(const float2& a, const float2& b) +ccl_device_inline bool isfinite_safe(float f) { - return a.x*b.x + a.y*b.y; + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); } -ccl_device_inline float cross(const float2& a, const float2& b) +ccl_device_inline float ensure_finite(float v) { - return (a.x*b.y - a.y*b.x); + return isfinite_safe(v)? 
v : 0.0f; } -#endif - #ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const int2 a, const int2 b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline float len(const float2& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float2 normalize(const float2& a) -{ - return a/len(a); -} - -ccl_device_inline float2 normalize_len(const float2& a, float *t) -{ - *t = len(a); - return a/(*t); -} - -ccl_device_inline float2 safe_normalize(const float2& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline bool operator==(const float2& a, const float2& b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline bool operator!=(const float2& a, const float2& b) -{ - return !(a == b); -} - -ccl_device_inline float2 min(const float2& a, const float2& b) -{ - return make_float2(min(a.x, b.x), min(a.y, b.y)); -} - -ccl_device_inline float2 max(const float2& a, const float2& b) -{ - return make_float2(max(a.x, b.x), max(a.y, b.y)); -} - -ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); } -ccl_device_inline float2 fabs(const float2& a) -{ - return make_float2(fabsf(a.x), fabsf(a.y)); -} - -ccl_device_inline float2 as_float2(const float4& a) -{ - return make_float2(a.x, a.y); -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float2(const char *label, const float2& a) -{ - printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 interp(const float2& a, const float2& b, float t) -{ - return a + t*(b - a); -} - -#endif - -/* Float3 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 operator-(const float3& a) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else - return make_float3(-a.x, -a.y, -a.z); -#endif -} - -ccl_device_inline 
float3 operator*(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,b.m128)); -#else - return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float f) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator*(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator/(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); -#else - return make_float3(f / a.x, f / a.y, f / a.z); -#endif -} - -ccl_device_inline float3 operator/(const float3& a, const float f) -{ - float invf = 1.0f/f; - return a * invf; -} - -ccl_device_inline float3 operator/(const float3& a, const float3& b) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); -#else - return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif -} - -ccl_device_inline float3 operator+(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_add_ps(a.m128, b.m128)); -#else - return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif -} - -ccl_device_inline float3 operator-(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_sub_ps(a.m128, b.m128)); -#else - return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif -} - -ccl_device_inline float3 operator+=(float3& a, const float3& b) -{ - return a = a + b; -} - -ccl_device_inline float3 operator*=(float3& a, const float3& b) -{ - return a = a * b; -} - -ccl_device_inline float3 operator*=(float3& a, float f) -{ - return a = a * f; -} - -ccl_device_inline float3 operator/=(float3& a, const float3& b) -{ - return a = a / b; -} - -ccl_device_inline float3 operator/=(float3& a, float f) -{ - float invf = 1.0f/f; - return a = a * invf; -} - -ccl_device_inline float dot(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else - return a.x*b.x + a.y*b.y + a.z*b.z; -#endif -} - -ccl_device_inline float dot_xy(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); -#else - return a.x*b.x + a.y*b.y; -#endif -} - -ccl_device_inline float dot(const float4& a, const float4& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -#else - return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); -#endif -} - -ccl_device_inline float3 cross(const float3& a, const float3& b) -{ - float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); - return r; -} - -#endif - -ccl_device_inline float 
len(const float3 a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); -#else - return sqrtf(dot(a, a)); -#endif -} - -ccl_device_inline float len_squared(const float3 a) -{ - return dot(a, a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float len_squared(const float4& a) -{ - return dot(a, a); -} - -ccl_device_inline float3 normalize(const float3& a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); - return _mm_div_ps(a.m128, norm); -#else - return a/len(a); -#endif -} - -#endif - -ccl_device_inline float3 saturate3(float3 a) +ccl_device_inline float clamp(float a, float mn, float mx) { - return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); + return min(max(a, mn), mx); } -ccl_device_inline float3 normalize_len(const float3 a, float *t) +ccl_device_inline float mix(float a, float b, float t) { - *t = len(a); - float x = 1.0f / *t; - return a*x; + return a + t*(b - a); } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float3 safe_normalize(const float3 a) +#ifndef __KERNEL_CUDA__ +ccl_device_inline float saturate(float a) { - float t = len(a); - return (t != 0.0f)? a * (1.0f/t) : a; + return clamp(a, 0.0f, 1.0f); } +#endif /* __KERNEL_CUDA__ */ -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +ccl_device_inline int float_to_int(float f) { - *t = len(a); - return (*t != 0.0f)? 
a/(*t): a; + return (int)f; } -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const float3& a, const float3& b) +ccl_device_inline int floor_to_int(float f) { -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif + return float_to_int(floorf(f)); } -ccl_device_inline bool operator!=(const float3& a, const float3& b) +ccl_device_inline int ceil_to_int(float f) { - return !(a == b); + return float_to_int(ceilf(f)); } -ccl_device_inline float3 min(const float3& a, const float3& b) +ccl_device_inline float signf(float f) { -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif + return (f < 0.0f)? -1.0f: 1.0f; } -ccl_device_inline float3 max(const float3& a, const float3& b) +ccl_device_inline float nonzerof(float f, float eps) { -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif + if(fabsf(f) < eps) + return signf(f)*eps; + else + return f; } -ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +ccl_device_inline float smoothstepf(float f) { - return min(max(a, mn), mx); + float ff = f*f; + return (3.0f*ff - 2.0f*ff*f); } -ccl_device_inline float3 fabs(const float3& a) +ccl_device_inline int mod(int x, int m) { -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); -#else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif + return (x % m + m) % m; } -#endif - ccl_device_inline float3 float2_to_float3(const float2 a) { return make_float3(a.x, a.y, 0.0f); @@ -704,546 +309,19 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float3(const char *label, const 
float3& a) -{ - printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); -} - -ccl_device_inline float3 rcp(const float3& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); -#endif -} - -#endif - -ccl_device_inline float3 interp(float3 a, float3 b, float t) -{ - return a + t*(b - a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 mix(const float3& a, const float3& b, float t) -{ - return a + t*(b - a); -} - -#endif - -ccl_device_inline bool is_zero(const float3 a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float3(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float3 a) -{ - return (a.x + a.y + a.z); -} - -ccl_device_inline float average(const float3 a) -{ - return reduce_add(a)*(1.0f/3.0f); -} - -ccl_device_inline bool isequal_float3(const float3 a, const float3 b) -{ -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else - return a == b; -#endif -} - -/* Float4 Vector */ - -#ifdef __KERNEL_SSE__ - -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b) -{ - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); -} - -#if defined(__KERNEL_SSE3__) -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) -{ - return _mm_moveldup_ps(b); -} - -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) -{ - return _mm_movehdup_ps(b); -} -#endif - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) -{ - return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float4 operator-(const float4& a) -{ -#ifdef __KERNEL_SSE__ - __m128 mask = 
_mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); -#else - return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_mul_ps(a.m128, b.m128); -#else - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, float f) -{ -#if defined(__KERNEL_SSE__) - return a * make_float4(f); -#else - return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); -#endif -} - -ccl_device_inline float4 operator*(float f, const float4& a) -{ - return a * f; -} - -ccl_device_inline float4 rcp(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); -#endif -} - -ccl_device_inline float4 operator/(const float4& a, float f) -{ - return a * (1.0f/f); -} - -ccl_device_inline float4 operator/(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return a * rcp(b); -#else - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -#endif - -} - -ccl_device_inline float4 operator+(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_ps(a.m128, b.m128); -#else - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline float4 operator-(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_sub_ps(a.m128, b.m128); -#else - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -#endif -} - -ccl_device_inline float4 operator+=(float4& a, const float4& b) -{ - return a = a + b; -} - -ccl_device_inline float4 operator*=(float4& a, const float4& b) -{ - return a = a * b; -} - -ccl_device_inline float4 operator/=(float4& a, float f) -{ - return a = a / f; -} - -ccl_device_inline int4 operator<(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return 
_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif -} - -ccl_device_inline int4 operator>=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif -} - -ccl_device_inline int4 operator<=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif -} - -ccl_device_inline bool operator==(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif -} - -ccl_device_inline float4 cross(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); -#else - return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f); -#endif -} - -ccl_device_inline bool is_zero(const float4& a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float4(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? 
*/ -#else - return ((a.x + a.y) + (a.z + a.w)); -#endif -} - -ccl_device_inline float average(const float4& a) -{ - return reduce_add(a) * 0.25f; -} - -ccl_device_inline float len(const float4& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float4 normalize(const float4& a) -{ - return a/len(a); -} - -ccl_device_inline float4 safe_normalize(const float4& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline float4 min(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline float4 max(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ -#else - return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? 
a.w: b.w); -#endif -} - -ccl_device_inline float4 reduce_min(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = min(shuffle<1,0,3,2>(a), a); - return min(shuffle<2,3,0,1>(h), h); -#else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -#endif -} - -ccl_device_inline float4 reduce_max(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = max(shuffle<1,0,3,2>(a), a); - return max(shuffle<2,3,0,1>(h), h); -#else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -#endif -} - -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; -#else - return make_float4((a.x + a.y) + (a.z + a.w)); -#endif -} -#endif - -ccl_device_inline void print_float4(const char *label, const float4& a) -{ - printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); -} - -#endif - -/* Int2 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int2 operator+(const int2 &a, const int2 &b) -{ - return make_int2(a.x + b.x, a.y + b.y); -} - -ccl_device_inline int2 operator+=(int2 &a, const int2 &b) -{ - return a = a + b; -} - -ccl_device_inline int2 operator-(const int2 &a, const int2 &b) -{ - return make_int2(a.x - b.x, a.y - b.y); -} - -ccl_device_inline int2 operator*(const int2 &a, const int2 &b) -{ - return make_int2(a.x * b.x, a.y * b.y); -} - -ccl_device_inline int2 operator/(const int2 &a, const int2 &b) -{ - return make_int2(a.x / b.x, a.y / b.y); -} - -#endif - -/* Int3 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int3 min(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif -} - -ccl_device_inline int3 max(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int3(max(a.x, b.x), max(a.y, b.y), 
max(a.z, b.z)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, make_int3(mn)), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, mn), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_int3(const char *label, const int3& a) -{ - printf("%s: %d %d %d\n", label, a.x, a.y, a.z); -} - -#endif - -/* Int4 */ - -#ifndef __KERNEL_GPU__ - -ccl_device_inline int4 operator+(const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_epi32(a.m128, b.m128); -#else - return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline int4 operator+=(int4& a, const int4& b) -{ - return a = a + b; -} - -ccl_device_inline int4 operator>>(const int4& a, int i) -{ -#ifdef __KERNEL_SSE__ - return _mm_srai_epi32(a.m128, i); -#else - return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); -#endif -} - -ccl_device_inline int4 min(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline int4 max(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - __m128 m = _mm_cvtepi32_ps(mask); - return 
_mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */ -#else - return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); -#endif -} +CCL_NAMESPACE_END -ccl_device_inline void print_int4(const char *label, const int4& a) -{ - printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); -} +#include "util/util_math_int2.h" +#include "util/util_math_int3.h" +#include "util/util_math_int4.h" -#endif +#include "util/util_math_float2.h" +#include "util/util_math_float3.h" +#include "util/util_math_float4.h" -/* Int/Float conversion */ +CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - - /* Interpolation */ template<class A, class B> A lerp(const A& a, const A& b, const B& t) @@ -1253,26 +331,13 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t) /* Triangle */ -ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) +ccl_device_inline float triangle_area(const float3& v1, + const float3& v2, + const float3& v3) { return len(cross(v3 - v2, v1 - v2))*0.5f; } - -#endif - -/* Versions of functions which are safe for fast math. 
*/ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); -} +#endif /* __KERNEL_OPENCL__ */ /* Orthonormal vectors */ @@ -1369,16 +434,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle) float3 r; r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) + - (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + - (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); + (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + + (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) + - ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + - (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); + ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + + (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) + - (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + - ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); + (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + + ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); return r; } @@ -1427,17 +492,17 @@ ccl_device float safe_powf(float a, float b) return compatible_powf(a, b); } -ccl_device float safe_logf(float a, float b) +ccl_device float safe_divide(float a, float b) { - if(UNLIKELY(a < 0.0f || b < 0.0f)) - return 0.0f; - - return logf(a)/logf(b); + return (b != 0.0f)? a/b: 0.0f; } -ccl_device float safe_divide(float a, float b) +ccl_device float safe_logf(float a, float b) { - return (b != 0.0f)? 
a/b: 0.0f; + if(UNLIKELY(a <= 0.0f || b <= 0.0f)) + return 0.0f; + + return safe_divide(logf(a),logf(b)); } ccl_device float safe_modulo(float a, float b) @@ -1493,31 +558,6 @@ ccl_device_inline float2 map_to_sphere(const float3 co) return make_float2(u, v); } -ccl_device_inline int util_max_axis(float3 vec) -{ -#ifdef __KERNEL_SSE__ - __m128 a = shuffle<0,0,1,1>(vec.m128); - __m128 b = shuffle<1,2,2,1>(vec.m128); - __m128 c = _mm_cmpgt_ps(a, b); - int mask = _mm_movemask_ps(c) & 0x7; - static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0}; - return tab[mask]; -#else - if(vec.x > vec.y) { - if(vec.x > vec.z) - return 0; - else - return 2; - } - else { - if(vec.y > vec.z) - return 1; - else - return 2; - } -#endif -} - CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h new file mode 100644 index 00000000000..6f9d0855d50 --- /dev/null +++ b/intern/cycles/util/util_math_float2.h @@ -0,0 +1,227 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT2_H__ +#define __UTIL_MATH_FLOAT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a); +ccl_device_inline float2 operator*(const float2& a, const float2& b); +ccl_device_inline float2 operator*(const float2& a, float f); +ccl_device_inline float2 operator*(float f, const float2& a); +ccl_device_inline float2 operator/(float f, const float2& a); +ccl_device_inline float2 operator/(const float2& a, float f); +ccl_device_inline float2 operator/(const float2& a, const float2& b); +ccl_device_inline float2 operator+(const float2& a, const float2& b); +ccl_device_inline float2 operator-(const float2& a, const float2& b); +ccl_device_inline float2 operator+=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, float f); +ccl_device_inline float2 operator/=(float2& a, const float2& b); +ccl_device_inline float2 operator/=(float2& a, float f); + +ccl_device_inline bool operator==(const float2& a, const float2& b); +ccl_device_inline bool operator!=(const float2& a, const float2& b); + +ccl_device_inline bool is_zero(const float2& a); +ccl_device_inline float average(const float2& a); +ccl_device_inline float dot(const float2& a, const float2& b); +ccl_device_inline float cross(const float2& a, const float2& b); +ccl_device_inline float len(const float2& a); +ccl_device_inline float2 normalize(const float2& a); +ccl_device_inline float2 normalize_len(const float2& a, float *t); +ccl_device_inline float2 safe_normalize(const float2& a); +ccl_device_inline float2 min(const float2& a, const float2& b); +ccl_device_inline float2 max(const float2& a, const float2& b); +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx); +ccl_device_inline float2 fabs(const float2& a); +ccl_device_inline float2 as_float2(const float4& a); +ccl_device_inline float2 interp(const float2& a, const float2& b, float t); +#endif /* !__KERNEL_OPENCL__ */ + 
+/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a) +{ + return make_float2(-a.x, -a.y); +} + +ccl_device_inline float2 operator*(const float2& a, const float2& b) +{ + return make_float2(a.x*b.x, a.y*b.y); +} + +ccl_device_inline float2 operator*(const float2& a, float f) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator*(float f, const float2& a) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator/(float f, const float2& a) +{ + return make_float2(f/a.x, f/a.y); +} + +ccl_device_inline float2 operator/(const float2& a, float f) +{ + float invf = 1.0f/f; + return make_float2(a.x*invf, a.y*invf); +} + +ccl_device_inline float2 operator/(const float2& a, const float2& b) +{ + return make_float2(a.x/b.x, a.y/b.y); +} + +ccl_device_inline float2 operator+(const float2& a, const float2& b) +{ + return make_float2(a.x+b.x, a.y+b.y); +} + +ccl_device_inline float2 operator-(const float2& a, const float2& b) +{ + return make_float2(a.x-b.x, a.y-b.y); +} + +ccl_device_inline float2 operator+=(float2& a, const float2& b) +{ + return a = a + b; +} + +ccl_device_inline float2 operator*=(float2& a, const float2& b) +{ + return a = a * b; +} + +ccl_device_inline float2 operator*=(float2& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float2 operator/=(float2& a, const float2& b) +{ + return a = a / b; +} + +ccl_device_inline float2 operator/=(float2& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float2& a, const float2& b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline bool operator!=(const float2& a, const float2& b) +{ + return !(a == b); +} + +ccl_device_inline bool is_zero(const float2& a) +{ + return (a.x == 0.0f && a.y == 0.0f); +} + +ccl_device_inline float average(const float2& a) +{ + return 
(a.x + a.y)*(1.0f/2.0f); +} + +ccl_device_inline float dot(const float2& a, const float2& b) +{ + return a.x*b.x + a.y*b.y; +} + +ccl_device_inline float cross(const float2& a, const float2& b) +{ + return (a.x*b.y - a.y*b.x); +} + +ccl_device_inline float len(const float2& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float2 normalize(const float2& a) +{ + return a/len(a); +} + +ccl_device_inline float2 normalize_len(const float2& a, float *t) +{ + *t = len(a); + return a/(*t); +} + +ccl_device_inline float2 safe_normalize(const float2& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float2 min(const float2& a, const float2& b) +{ + return make_float2(min(a.x, b.x), min(a.y, b.y)); +} + +ccl_device_inline float2 max(const float2& a, const float2& b) +{ + return make_float2(max(a.x, b.x), max(a.y, b.y)); +} + +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float2 fabs(const float2& a) +{ + return make_float2(fabsf(a.x), fabsf(a.y)); +} + +ccl_device_inline float2 as_float2(const float4& a) +{ + return make_float2(a.x, a.y); +} + +ccl_device_inline float2 interp(const float2& a, const float2& b, float t) +{ + return a + t*(b - a); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h new file mode 100644 index 00000000000..bb04c4aa2d9 --- /dev/null +++ b/intern/cycles/util/util_math_float3.h @@ -0,0 +1,385 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT3_H__ +#define __UTIL_MATH_FLOAT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a); +ccl_device_inline float3 operator*(const float3& a, const float3& b); +ccl_device_inline float3 operator*(const float3& a, const float f); +ccl_device_inline float3 operator*(const float f, const float3& a); +ccl_device_inline float3 operator/(const float f, const float3& a); +ccl_device_inline float3 operator/(const float3& a, const float f); +ccl_device_inline float3 operator/(const float3& a, const float3& b); +ccl_device_inline float3 operator+(const float3& a, const float3& b); +ccl_device_inline float3 operator-(const float3& a, const float3& b); +ccl_device_inline float3 operator+=(float3& a, const float3& b); +ccl_device_inline float3 operator-=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, float f); +ccl_device_inline float3 operator/=(float3& a, const float3& b); +ccl_device_inline float3 operator/=(float3& a, float f); + +ccl_device_inline bool operator==(const float3& a, const float3& b); +ccl_device_inline bool operator!=(const float3& a, const float3& b); + +ccl_device_inline float dot(const float3& a, const float3& b); +ccl_device_inline float 
dot_xy(const float3& a, const float3& b); +ccl_device_inline float3 cross(const float3& a, const float3& b); +ccl_device_inline float3 normalize(const float3& a); +ccl_device_inline float3 min(const float3& a, const float3& b); +ccl_device_inline float3 max(const float3& a, const float3& b); +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx); +ccl_device_inline float3 fabs(const float3& a); +ccl_device_inline float3 mix(const float3& a, const float3& b, float t); +ccl_device_inline float3 rcp(const float3& a); +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float max3(float3 a); +ccl_device_inline float len(const float3 a); +ccl_device_inline float len_squared(const float3 a); + +ccl_device_inline float3 saturate3(float3 a); +ccl_device_inline float3 safe_normalize(const float3 a); +ccl_device_inline float3 normalize_len(const float3 a, float *t);; +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t); +ccl_device_inline float3 interp(float3 a, float3 b, float t); + +ccl_device_inline bool is_zero(const float3 a); +ccl_device_inline float reduce_add(const float3 a); +ccl_device_inline float average(const float3 a); +ccl_device_inline bool isequal_float3(const float3 a, const float3 b); + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#else + return make_float3(-a.x, -a.y, -a.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,b.m128)); +#else + return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float f) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator*(const float f, const float3& a) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ +#if defined(__KERNEL_SSE__) && 0 + return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator/(const float f, const float3& a) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ +#if defined(__KERNEL_SSE__) && 0 + __m128 rc = _mm_rcp_ps(a.m128); + return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#else + return make_float3(f / a.x, f / a.y, f / a.z); +#endif +} + +ccl_device_inline float3 operator/(const float3& a, const float f) +{ + float invf = 1.0f/f; + return a * invf; +} + +ccl_device_inline float3 operator/(const float3& a, const float3& b) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ +#if defined(__KERNEL_SSE__) && 0 + __m128 rc = _mm_rcp_ps(b.m128); + return float3(_mm_mul_ps(a, rc)); +#else + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +#endif +} + +ccl_device_inline float3 operator+(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_add_ps(a.m128, b.m128)); +#else + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +#endif +} + +ccl_device_inline float3 operator-(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +#endif +} + +ccl_device_inline float3 operator+=(float3& a, const float3& b) +{ + return a = a + b; +} + +ccl_device_inline float3 operator-=(float3& a, const float3& b) +{ + return a = a - b; +} + +ccl_device_inline float3 operator*=(float3& a, const float3& b) +{ + return a = a * b; +} + +ccl_device_inline float3 operator*=(float3& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float3 operator/=(float3& a, const float3& b) +{ + return a = a / b; +} + +ccl_device_inline float3 operator/=(float3& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z); +#endif +} + +ccl_device_inline bool operator!=(const float3& a, const float3& b) +{ + return !(a == b); +} + +ccl_device_inline float dot(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); +#else + return a.x*b.x + a.y*b.y + a.z*b.z; +#endif +} + +ccl_device_inline float dot_xy(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); +#else + return a.x*b.x + a.y*b.y; +#endif +} + +ccl_device_inline float3 
cross(const float3& a, const float3& b) +{ + float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); + return r; +} + +ccl_device_inline float3 normalize(const float3& a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); + return float3(_mm_div_ps(a.m128, norm)); +#else + return a/len(a); +#endif +} + +ccl_device_inline float3 min(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_min_ps(a.m128, b.m128)); +#else + return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline float3 max(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_max_ps(a.m128, b.m128)); +#else + return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float3 fabs(const float3& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return float3(_mm_and_ps(a.m128, mask)); +#else + return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif +} + +ccl_device_inline float3 mix(const float3& a, const float3& b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline float3 rcp(const float3& a) +{ +#ifdef __KERNEL_SSE__ + const float4 r(_mm_rcp_ps(a.m128)); + return float3(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); +#else + return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); +#endif +} +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float max3(float3 a) +{ + return max(max(a.x, a.y), a.z); +} + +ccl_device_inline float len(const float3 a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); +#else + return sqrtf(dot(a, a)); +#endif +} + +ccl_device_inline float len_squared(const float3 a) +{ + return 
dot(a, a); +} + +ccl_device_inline float3 saturate3(float3 a) +{ + return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); +} + +ccl_device_inline float3 normalize_len(const float3 a, float *t) +{ + *t = len(a); + float x = 1.0f / *t; + return a*x; +} + +ccl_device_inline float3 safe_normalize(const float3 a) +{ + float t = len(a); + return (t != 0.0f)? a * (1.0f/t) : a; +} + +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +{ + *t = len(a); + return (*t != 0.0f)? a/(*t): a; +} + +ccl_device_inline float3 interp(float3 a, float3 b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline bool is_zero(const float3 a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float3(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); +#endif +} + +ccl_device_inline float reduce_add(const float3 a) +{ + return (a.x + a.y + a.z); +} + +ccl_device_inline float average(const float3 a) +{ + return reduce_add(a)*(1.0f/3.0f); +} + +ccl_device_inline bool isequal_float3(const float3 a, const float3 b) +{ +#ifdef __KERNEL_OPENCL__ + return all(a == b); +#else + return a == b; +#endif +} + +ccl_device_inline bool isfinite3_safe(float3 v) +{ + return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); +} + +ccl_device_inline float3 ensure_finite3(float3 v) +{ + if(!isfinite_safe(v.x)) v.x = 0.0f; + if(!isfinite_safe(v.y)) v.y = 0.0f; + if(!isfinite_safe(v.z)) v.z = 0.0f; + return v; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h new file mode 100644 index 00000000000..d89121b3a1d --- /dev/null +++ b/intern/cycles/util/util_math_float4.h @@ -0,0 +1,393 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT4_H__ +#define __UTIL_MATH_FLOAT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a); +ccl_device_inline float4 operator*(const float4& a, const float4& b); +ccl_device_inline float4 operator*(const float4& a, float f); +ccl_device_inline float4 operator*(float f, const float4& a); +ccl_device_inline float4 operator/(const float4& a, float f); +ccl_device_inline float4 operator/(const float4& a, const float4& b); +ccl_device_inline float4 operator+(const float4& a, const float4& b); +ccl_device_inline float4 operator-(const float4& a, const float4& b); +ccl_device_inline float4 operator+=(float4& a, const float4& b); +ccl_device_inline float4 operator*=(float4& a, const float4& b); +ccl_device_inline float4 operator/=(float4& a, float f); + +ccl_device_inline int4 operator<(const float4& a, const float4& b); +ccl_device_inline int4 operator>=(const float4& a, const float4& b); +ccl_device_inline int4 operator<=(const float4& a, const float4& b); +ccl_device_inline bool operator==(const float4& a, const float4& b); + +ccl_device_inline float dot(const float4& a, const float4& b); +ccl_device_inline float len_squared(const float4& a); +ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 cross(const float4& a, const float4& b); 
+ccl_device_inline bool is_zero(const float4& a); +ccl_device_inline float reduce_add(const float4& a); +ccl_device_inline float average(const float4& a); +ccl_device_inline float len(const float4& a); +ccl_device_inline float4 normalize(const float4& a); +ccl_device_inline float4 safe_normalize(const float4& a); +ccl_device_inline float4 min(const float4& a, const float4& b); +ccl_device_inline float4 max(const float4& a, const float4& b); +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b); + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); +# endif +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b); +ccl_device_inline float4 reduce_min(const float4& a); +ccl_device_inline float4 reduce_max(const float4& a); +# if 0 +ccl_device_inline float4 reduce_add(const float4& a); +# endif +#endif /* !__KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return float4(_mm_xor_ps(a.m128, mask)); +#else + return make_float4(-a.x, -a.y, -a.z, -a.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_mul_ps(a.m128, b.m128)); +#else + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, float f) +{ +#if defined(__KERNEL_SSE__) + return a * make_float4(f); +#else + return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); +#endif +} + +ccl_device_inline float4 operator*(float f, const float4& a) +{ + return a * f; +} + +ccl_device_inline float4 operator/(const float4& a, float f) +{ + return a * (1.0f/f); +} + +ccl_device_inline float4 operator/(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return a * rcp(b); +#else + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +#endif + +} + +ccl_device_inline float4 operator+(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_add_ps(a.m128, b.m128)); +#else + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline float4 operator-(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +#endif +} + +ccl_device_inline float4 operator+=(float4& a, const float4& b) +{ + return a = a + b; +} + +ccl_device_inline float4 operator*=(float4& a, const float4& b) +{ + return a = a * b; +} + +ccl_device_inline float4 operator/=(float4& a, float f) +{ + return a = a / f; +} + +ccl_device_inline int4 operator<(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. 
*/ + return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +ccl_device_inline int4 operator>=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +ccl_device_inline int4 operator<=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); +#else + return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); +#endif +} + +ccl_device_inline bool operator==(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +#endif +} + +ccl_device_inline float dot(const float4& a, const float4& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); +#else + return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); +#endif +} + +ccl_device_inline float len_squared(const float4& a) +{ + return dot(a, a); +} + +ccl_device_inline float4 rcp(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 r(_mm_rcp_ps(a.m128)); + return float4(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); +#else + return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); +#endif +} + +ccl_device_inline float4 cross(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - + (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); +#else + return make_float4(a.y*b.z - a.z*b.y, + a.z*b.x - a.x*b.z, + a.x*b.y - a.y*b.x, + 0.0f); +#endif +} + +ccl_device_inline bool is_zero(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float4(0.0f); +#else + return (a.x == 0.0f && a.y == 
0.0f && a.z == 0.0f && a.w == 0.0f); +#endif +} + +ccl_device_inline float reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h(shuffle<1,0,3,2>(a) + a); + /* TODO(sergey): Investigate efficiency. */ + return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); +#else + return ((a.x + a.y) + (a.z + a.w)); +#endif +} + +ccl_device_inline float average(const float4& a) +{ + return reduce_add(a) * 0.25f; +} + +ccl_device_inline float len(const float4& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float4 normalize(const float4& a) +{ + return a/len(a); +} + +ccl_device_inline float4 safe_normalize(const float4& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float4 min(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_min_ps(a.m128, b.m128)); +#else + return make_float4(min(a.x, b.x), + min(a.y, b.y), + min(a.z, b.z), + min(a.w, b.w)); +#endif +} + +ccl_device_inline float4 max(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_max_ps(a.m128, b.m128)); +#else + return make_float4(max(a.x, b.x), + max(a.y, b.y), + max(a.z, b.z), + max(a.w, b.w)); +#endif +} +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b) +{ + return float4(_mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(b), + _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); +} + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) +{ + return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); +} + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) +{ + return float4(_mm_moveldup_ps(b)); +} + +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) +{ + return float4(_mm_movehdup_ps(b)); +} +# endif /* __KERNEL_SSE3__ */ +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ 
+ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), + _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); +#else + return make_float4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline float4 reduce_min(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = min(shuffle<1,0,3,2>(a), a); + return min(shuffle<2,3,0,1>(h), h); +#else + return make_float4(min(min(a.x, a.y), min(a.z, a.w))); +#endif +} + +ccl_device_inline float4 reduce_max(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = max(shuffle<1,0,3,2>(a), a); + return max(shuffle<2,3,0,1>(h), h); +#else + return make_float4(max(max(a.x, a.y), max(a.z, a.w))); +#endif +} + +#if 0 +ccl_device_inline float4 reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = shuffle<1,0,3,2>(a) + a; + return shuffle<2,3,0,1>(h) + h; +#else + return make_float4((a.x + a.y) + (a.z + a.w)); +#endif +} +#endif +#endif /* !__KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h new file mode 100644 index 00000000000..828c49a131c --- /dev/null +++ b/intern/cycles/util/util_math_int2.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT2_H__ +#define __UTIL_MATH_INT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b); +ccl_device_inline int2 operator+(const int2 &a, const int2 &b); +ccl_device_inline int2 operator+=(int2 &a, const int2 &b); +ccl_device_inline int2 operator-(const int2 &a, const int2 &b); +ccl_device_inline int2 operator*(const int2 &a, const int2 &b); +ccl_device_inline int2 operator/(const int2 &a, const int2 &b); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline int2 operator+(const int2 &a, const int2 &b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} + +ccl_device_inline int2 operator+=(int2 &a, const int2 &b) +{ + return a = a + b; +} + +ccl_device_inline int2 operator-(const int2 &a, const int2 &b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} + +ccl_device_inline int2 operator*(const int2 &a, const int2 &b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} + +ccl_device_inline int2 operator/(const int2 &a, const int2 &b) +{ + return make_int2(a.x / b.x, a.y / b.y); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT2_H__ */ diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h new file mode 100644 index 00000000000..fa7a02636de --- /dev/null +++ b/intern/cycles/util/util_math_int3.h @@ -0,0 +1,83 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + 
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT3_H__ +#define __UTIL_MATH_INT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b); +ccl_device_inline int3 max(int3 a, int3 b); +ccl_device_inline int3 clamp(const int3& a, int mn, int mx); +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline int3 max(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, make_int3(mn)), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, mn), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn.x, mx), + clamp(a.y, mn.y, mx), + clamp(a.z, mn.z, mx)); +#endif +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h new file mode 100644 index 00000000000..79a8c0841e7 --- /dev/null +++ b/intern/cycles/util/util_math_int4.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_MATH_INT4_H__ +#define __UTIL_MATH_INT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b); +ccl_device_inline int4 operator+=(int4& a, const int4& b); +ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 min(int4 a, int4 b); +ccl_device_inline int4 max(int4 a, int4 b); +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b); +#endif /* __KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_add_epi32(a.m128, b.m128)); +#else + return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline int4 operator+=(int4& a, const int4& b) +{ + return a = a + b; +} + +ccl_device_inline int4 operator>>(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_srai_epi32(a.m128, i)); +#else + return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); +#endif +} + +ccl_device_inline int4 min(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif +} + +ccl_device_inline int4 max(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif +} + +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const 
int4& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + const __m128 m = _mm_cvtepi32_ps(mask); + /* TODO(sergey): avoid cvt. */ + return int4(_mm_castps_si128( + _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), + _mm_andnot_ps(m, _mm_castsi128_ps(b))))); +#else + return make_int4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline int4 load_int4(const int *v) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_loadu_si128((__m128i*)v)); +#else + return make_int4(v[0], v[1], v[2], v[3]); +#endif +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h new file mode 100644 index 00000000000..c7511f8306e --- /dev/null +++ b/intern/cycles/util/util_math_matrix.h @@ -0,0 +1,404 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_MATRIX_H__ +#define __UTIL_MATH_MATRIX_H__ + +CCL_NAMESPACE_BEGIN + +#define MAT(A, size, row, col) A[(row)*(size)+(col)] + +/* Variants that use a constant stride on GPUS. */ +#ifdef __KERNEL_GPU__ +# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +/* Element access when only the lower-triangular elements are stored. 
*/ +# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +# define VECS(V, i, s) V[(i)*(s)] +#else +# define MATS(A, n, r, c, s) MAT(A, n, r, c) +# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +# define VECS(V, i, s) V[i] +#endif + +/* Zeroing helpers. */ + +ccl_device_inline void math_vector_zero(float *v, int n) +{ + for(int i = 0; i < n; i++) { + v[i] = 0.0f; + } +} + +ccl_device_inline void math_matrix_zero(float *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = 0.0f; + } + } +} + +/* Elementary vector operations. */ + +ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] += b[i]; + } +} + +ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b[i]; + } +} + +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n) +{ + for(int i = 0; i < n; i++) { + a[i*astride] *= b[i]; + } +} + +ccl_device_inline void math_vector_scale(float *a, float b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b; + } +} + +ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = max(a[i], b[i]); + } +} + +ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) +{ + for(int i = 0; i < n; i++) { + v[i] += w*x[i]; + } +} + +ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) +{ + for(int i = 0; i < n; i++) { + v[i*stride] += w*x[i]; + } +} + +/* Elementary matrix operations. + * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. 
*/ + +ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) +{ + for(int row = 0; row < n; row++) { + MATHS(A, row, row, stride) += val; + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian(float *A, + int n, + const float *ccl_restrict v, + float weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) += v[row]*v[col]*weight; + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, + int n, + const float *ccl_restrict v, + float weight, + int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MATHS(A, row, col, stride) += v[row]*v[col]*weight; + } + } +} + +/* Transpose matrix A inplace. */ +ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride) +{ + for(int i = 0; i < n; i++) { + for(int j = 0; j < i; j++) { + float temp = MATS(A, n, i, j, stride); + MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride); + MATS(A, n, j, i, stride) = temp; + } + } +} + +/* Solvers for matrix problems */ + +/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A + * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L. + * Also, only the lower triangular part of A is ever accessed. 
*/ +ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + float sum_col = MATHS(A, row, col, stride); + for(int k = 0; k < col; k++) { + sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride); + } + if(row == col) { + sum_col = sqrtf(max(sum_col, 0.0f)); + } + else { + sum_col /= MATHS(A, col, col, stride); + } + MATHS(A, row, col, stride) = sum_col; + } + } +} + +/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process. + * + * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A. + * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S. + * Since L is lower triangular, finding b is relatively easy since y is known. + * Then, the remaining problem is Lt*S = b, which again can be solved easily. + * + * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is + * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */ +ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride) +{ + /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good + * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction + * is scaled based on it. */ + math_trimatrix_add_diagonal(A, n, 3e-7f*A[0], stride); /* Improve the numerical stability. */ + math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */ + + /* Use forward substitution to solve L*b = y, replacing y by b. 
 */ + for(int row = 0; row < n; row++) { + float3 sum = VECS(y, row, stride); + for(int col = 0; col < row; col++) + sum -= MATHS(A, row, col, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } + + /* Use backward substitution to solve Lt*S = b, replacing b by S. */ + for(int row = n-1; row >= 0; row--) { + float3 sum = VECS(y, row, stride); + for(int col = row+1; col < n; col++) + sum -= MATHS(A, col, row, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } +} + +/* Perform the Jacobi Eigenvalue Method on matrix A. + * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. + * The algorithm overwrites the contents of A. + * + * After returning, A will be overwritten with D, which is (almost) diagonal, + * and V will contain the eigenvectors of the original A in its rows (!), + * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A. + */ +ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride) +{ + const float singular_epsilon = 1e-9f; + + for (int row = 0; row < n; row++) { + for (int col = 0; col < n; col++) { + MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + } + } + + for (int sweep = 0; sweep < 8; sweep++) { + float off_diagonal = 0.0f; + for (int row = 1; row < n; row++) { + for (int col = 0; col < row; col++) { + off_diagonal += fabsf(MAT(A, n, row, col)); + } + } + if (off_diagonal < 1e-7f) { + /* The matrix has nearly reached diagonal form. + * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ + break; + } + + /* Set the threshold for the small element rotation skip in the first sweep: + * Skip all elements that are less than a tenth of the average off-diagonal element. 
 */ + float threshold = 0.2f*off_diagonal / (n*n); + + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + /* Perform a Jacobi rotation on this element that reduces it to zero. */ + float element = MAT(A, n, row, col); + float abs_element = fabsf(element); + + /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ + if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + MAT(A, n, row, col) = 0.0f; + continue; + } + + if(element == 0.0f) { + continue; + } + + /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */ + if(sweep < 3 && (abs_element < threshold)) { + continue; + } + + /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi). + * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case. + * Then, we compute sin(phi) and cos(phi) themselves. */ + float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); + float ratio; + if (abs_element > singular_epsilon*fabsf(singular_diff)) { + float cot_2phi = 0.5f*singular_diff / element; + ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); + if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + } + else { + ratio = element / singular_diff; + } + + float c = 1.0f / sqrtf(1.0f + ratio*ratio); + float s = ratio*c; + /* To improve numerical stability by avoiding cancellation, the update equations are reformulated to use sin(phi) and tan(phi/2) instead. */ + float tan_phi_2 = s / (1.0f + c); + + /* Update the singular values in the diagonal. */ + float singular_delta = ratio*element; + MAT(A, n, row, row) += singular_delta; + MAT(A, n, col, col) -= singular_delta; + + /* Set the element itself to zero. 
*/ + MAT(A, n, row, col) = 0.0f; + + /* Perform the actual rotations on the matrices. */ +#define ROT(M, r1, c1, r2, c2, stride) \ + { \ + float M1 = MATS(M, n, r1, c1, stride); \ + float M2 = MATS(M, n, r2, c2, stride); \ + MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \ + MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \ + } + + /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */ + for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1); + for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1); + for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1); + + for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride); +#undef ROT + } + } + } + + /* Sort eigenvalues and the associated eigenvectors. */ + for (int i = 0; i < n - 1; i++) { + float v = MAT(A, n, i, i); + int k = i; + for (int j = i; j < n; j++) { + if (MAT(A, n, j, j) >= v) { + v = MAT(A, n, j, j); + k = j; + } + } + if (k != i) { + /* Swap eigenvalues. */ + MAT(A, n, k, k) = MAT(A, n, i, i); + MAT(A, n, i, i) = v; + /* Swap eigenvectors. */ + for (int j = 0; j < n; j++) { + float v = MATS(V, n, i, j, v_stride); + MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); + MATS(V, n, k, j, v_stride) = v; + } + } + } +} + +#ifdef __KERNEL_SSE3__ +ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +{ + for(int i = 0; i < n; i++) { + A[i] = _mm_setzero_ps(); + } +} + +ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_setzero_ps(); + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. 
*/ +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + } + } +} + +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] = _mm_add_ps(V[i], a[i]); + } +} + +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] = _mm_mul_ps(V[i], a[i]); + } +} + +ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = _mm_max_ps(a[i], b[i]); + } +} + +ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + } + } +} +#endif + +#undef MAT + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_MATRIX_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index cd3067f7650..f9c3b4bb139 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -768,9 +768,17 @@ bool path_remove(const string& path) return remove(path.c_str()) == 0; } -static string line_directive(const string& path, int line) +static string line_directive(const string& base, const string& path, int line) { string escaped_path = path; + /* First we make path relative. */ + if(string_startswith(escaped_path, base.c_str())) { + const string base_file = path_filename(base); + const size_t base_len = base.length(); + escaped_path = base_file + escaped_path.substr(base_len, + escaped_path.length() - base_len); + } + /* Second, we replace all unsafe characters. 
*/ string_replace(escaped_path, "\"", "\\\""); string_replace(escaped_path, "\'", "\\\'"); string_replace(escaped_path, "\?", "\\\?"); @@ -778,13 +786,13 @@ static string line_directive(const string& path, int line) return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); } - -string path_source_replace_includes(const string& source, - const string& path, - const string& source_filename) +static string path_source_replace_includes_recursive( + const string& base, + const string& source, + const string& source_filepath) { /* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of opencl drivers not supporting + * contents, to work around issue of OpenCL drivers not supporting * include paths with spaces in them. */ @@ -799,23 +807,22 @@ string path_source_replace_includes(const string& source, if(string_startswith(token, "include")) { token = string_strip(token.substr(7, token.size() - 7)); if(token[0] == '"') { - size_t n_start = 1; - size_t n_end = token.find("\"", n_start); - string filename = token.substr(n_start, n_end - n_start); - string text, filepath = path_join(path, filename); + const size_t n_start = 1; + const size_t n_end = token.find("\"", n_start); + const string filename = token.substr(n_start, n_end - n_start); + string filepath = path_join(base, filename); + if(!path_exists(filepath)) { + filepath = path_join(path_dirname(source_filepath), + filename); + } + string text; if(path_read_text(filepath, text)) { - /* Replace include directories with both current path - * and path extracted from the include file. - * Not totally robust, but works fine for Cycles kernel - * and avoids having list of include directories.x - */ - text = path_source_replace_includes( - text, path_dirname(filepath), filename); - text = path_source_replace_includes(text, path, filename); + text = path_source_replace_includes_recursive( + base, text, filepath); /* Use line directives for better error messages. 
*/ - line = line_directive(filepath, 1) + line = line_directive(base, filepath, 1) + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i + 1); + + line_directive(base, source_filepath, i + 1); } } } @@ -826,6 +833,16 @@ string path_source_replace_includes(const string& source, return result; } +string path_source_replace_includes(const string& source, + const string& path, + const string& source_filename) +{ + return path_source_replace_includes_recursive( + path, + source, + path_join(path, source_filename)); +} + FILE *path_fopen(const string& path, const string& mode) { #ifdef _WIN32 diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 39c1eed04e7..134383e88db 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -37,9 +37,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -75,9 +77,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -144,6 +148,7 @@ public: thread_scoped_lock lock(progress_mutex); start_time = time_dt(); + end_time = 0.0; } void set_render_start_time() @@ -167,8 +172,15 @@ public: { thread_scoped_lock lock(progress_mutex); - total_time_ = time_dt() - start_time; - render_time_ = time_dt() - render_start_time; + double time = (end_time > 0) ? 
end_time : time_dt(); + + total_time_ = time - start_time; + render_time_ = time - render_start_time; + } + + void set_end_time() + { + end_time = time_dt(); } void reset_sample() @@ -177,7 +189,8 @@ public: pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; } void set_total_pixel_samples(uint64_t total_pixel_samples_) @@ -209,23 +222,36 @@ public: set_update(); } - void add_finished_tile() + void add_finished_tile(bool denoised) { thread_scoped_lock lock(progress_mutex); - finished_tiles++; + if(denoised) { + denoised_tiles++; + } + else { + rendered_tiles++; + } } int get_current_sample() { + thread_scoped_lock lock(progress_mutex); /* Note that the value here always belongs to the last tile that updated, * so it's only useful if there is only one active tile. */ return current_tile_sample; } - int get_finished_tiles() + int get_rendered_tiles() + { + thread_scoped_lock lock(progress_mutex); + return rendered_tiles; + } + + int get_denoised_tiles() { - return finished_tiles; + thread_scoped_lock lock(progress_mutex); + return denoised_tiles; } /* status messages */ @@ -318,9 +344,11 @@ protected: int current_tile_sample; /* Stores the number of tiles that's already finished. * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */ - int finished_tiles; + int rendered_tiles, denoised_tiles; double start_time, render_start_time; + /* End time written when render is done, so it doesn't keep increasing on redraws. 
*/ + double end_time; string status; string substatus; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 557809a5719..587febe3e52 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v) static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#ifdef __KERNEL_SSE3__ /* Emulation of SSE4 functions with SSE3 */ - -#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) +# ifndef __KERNEL_SSE41__ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -341,42 +341,48 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 +#undef _mm_blendv_ps #define _mm_blendv_ps __emu_mm_blendv_ps __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); } +#undef _mm_blend_ps #define _mm_blend_ps __emu_mm_blend_ps __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } +#undef _mm_blendv_epi8 #define _mm_blendv_epi8 __emu_mm_blendv_epi8 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } +#undef _mm_mullo_epi32 #define _mm_mullo_epi32 __emu_mm_mullo_epi32 __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { __m128i rvalue; char* _r = (char*)(&rvalue + 1); char* _v = (char*)(& value + 1); char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); + for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); return rvalue; } - +#undef _mm_min_epi32 #define _mm_min_epi32 
__emu_mm_min_epi32 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } +#undef _mm_max_epi32 #define _mm_max_epi32 __emu_mm_max_epi32 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } +#undef _mm_extract_epi32 #define _mm_extract_epi32 __emu_mm_extract_epi32 __forceinline int _mm_extract_epi32( __m128i input, const int index ) { switch ( index ) { @@ -388,20 +394,24 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } } +#undef _mm_insert_epi32 #define _mm_insert_epi32 __emu_mm_insert_epi32 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } +#undef _mm_extract_ps #define _mm_extract_ps __emu_mm_extract_ps __forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32* ptr = (int32*)&input; return ptr[index]; + int32_t* ptr = (int32_t*)&input; return ptr[index]; } +#undef _mm_insert_ps #define _mm_insert_ps __emu_mm_insert_ps __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } +#undef _mm_round_ps #define _mm_round_ps __emu_mm_round_ps __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) { @@ -415,18 +425,55 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -#ifdef _M_X64 +# ifdef _M_X64 +#undef _mm_insert_epi64 #define _mm_insert_epi64 __emu_mm_insert_epi64 __forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; } +#undef _mm_extract_epi64 #define _mm_extract_epi64 __emu_mm_extract_epi64 __forceinline 
__int64 _mm_extract_epi64( __m128i input, const int index ) { assert(size_t(index) < 2); return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); } -#endif +# endif + +# endif + +#undef _mm_fabs_ps +#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) + +/* Return a __m128 with every element set to the largest element of v. */ +ccl_device_inline __m128 _mm_hmax_ps(__m128 v) +{ + /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ + v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); + /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ + v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); + return v; +} + +/* Return the sum of the four elements of x. */ +ccl_device_inline float _mm_hsum_ss(__m128 x) +{ + __m128 a = _mm_movehdup_ps(x); + __m128 b = _mm_add_ps(x, a); + return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); +} + +/* Return a __m128 with every element set to the sum of the four elements of x. */ +ccl_device_inline __m128 _mm_hsum_ps(__m128 x) +{ + x = _mm_hadd_ps(x, x); + x = _mm_hadd_ps(x, x); + return x; +} + +/* Replace elements of x with zero where mask isn't set. */ +#undef _mm_mask_ps +#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) #endif diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index a1008d510d1..94ad512982c 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other) string string_remove_trademark(const string &s) { string result = s; + + /* Special case, so we don;t leave sequential spaces behind. */ + /* TODO(sergey): Consider using regex perhaps? 
*/ + string_replace(result, " (TM)", ""); + string_replace(result, " (R)", ""); + string_replace(result, "(TM)", ""); string_replace(result, "(R)", ""); diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index fb0c34e1dc4..6ed97b0e0a6 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads) threads.resize(num_threads); const int num_groups = system_cpu_group_count(); - unsigned short num_process_groups; + unsigned short num_process_groups = 0; vector<unsigned short> process_groups; - int current_group_threads; + int current_group_threads = 0; if(num_groups > 1) { process_groups.resize(num_groups); num_process_groups = system_cpu_process_groups(num_groups, diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index aff928ea2ee..df255f43059 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -21,62 +21,22 @@ CCL_NAMESPACE_BEGIN /* Texture limits on devices. 
*/ -/* CPU */ -#define TEX_NUM_FLOAT4_CPU 1024 -#define TEX_NUM_BYTE4_CPU 1024 -#define TEX_NUM_HALF4_CPU 1024 -#define TEX_NUM_FLOAT_CPU 1024 -#define TEX_NUM_BYTE_CPU 1024 -#define TEX_NUM_HALF_CPU 1024 -#define TEX_START_FLOAT4_CPU 0 -#define TEX_START_BYTE4_CPU TEX_NUM_FLOAT4_CPU -#define TEX_START_HALF4_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU) -#define TEX_START_FLOAT_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU) -#define TEX_START_BYTE_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU) -#define TEX_START_HALF_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU + TEX_NUM_BYTE_CPU) - /* CUDA (Geforce 4xx and 5xx) */ -#define TEX_NUM_FLOAT4_CUDA 5 -#define TEX_NUM_BYTE4_CUDA 85 -#define TEX_NUM_HALF4_CUDA 0 -#define TEX_NUM_FLOAT_CUDA 0 -#define TEX_NUM_BYTE_CUDA 0 -#define TEX_NUM_HALF_CUDA 0 -#define TEX_START_FLOAT4_CUDA 0 -#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA -#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA) -#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA) -#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA) -#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA) - -/* CUDA (Kepler, Geforce 6xx and above) */ -#define TEX_NUM_FLOAT4_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE4_CUDA_KEPLER 1024 -#define TEX_NUM_HALF4_CUDA_KEPLER 1024 -#define TEX_NUM_FLOAT_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE_CUDA_KEPLER 1024 -#define TEX_NUM_HALF_CUDA_KEPLER 1024 -#define TEX_START_FLOAT4_CUDA_KEPLER 0 -#define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER -#define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER) -#define TEX_START_FLOAT_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER) 
-#define TEX_START_BYTE_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER) -#define TEX_START_HALF_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER + TEX_NUM_BYTE_CUDA_KEPLER) - -/* OpenCL */ -#define TEX_NUM_FLOAT4_OPENCL 1024 -#define TEX_NUM_BYTE4_OPENCL 1024 -#define TEX_NUM_HALF4_OPENCL 0 -#define TEX_NUM_FLOAT_OPENCL 1024 -#define TEX_NUM_BYTE_OPENCL 1024 -#define TEX_NUM_HALF_OPENCL 0 -#define TEX_START_FLOAT4_OPENCL 0 -#define TEX_START_BYTE4_OPENCL TEX_NUM_FLOAT4_OPENCL -#define TEX_START_HALF4_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL) -#define TEX_START_FLOAT_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL) -#define TEX_START_BYTE_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL) -#define TEX_START_HALF_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL + TEX_NUM_BYTE_OPENCL) - +#define TEX_NUM_FLOAT4_CUDA 5 +#define TEX_NUM_BYTE4_CUDA 84 +#define TEX_NUM_HALF4_CUDA 0 +#define TEX_NUM_FLOAT_CUDA 0 +#define TEX_NUM_BYTE_CUDA 0 +#define TEX_NUM_HALF_CUDA 0 +#define TEX_START_FLOAT4_CUDA 0 +#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA +#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA) +#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA) +#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA) +#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA) + +/* Any architecture other than old CUDA cards */ +#define TEX_NUM_MAX (INT_MAX >> 4) /* Color to use when textures are not found. 
*/ #define TEX_IMAGE_MISSING_R 1 @@ -84,6 +44,14 @@ CCL_NAMESPACE_BEGIN #define TEX_IMAGE_MISSING_B 1 #define TEX_IMAGE_MISSING_A 1 +#if defined (__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define kernel_tex_type(tex) (tex < TEX_START_BYTE4_CUDA ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_BYTE4) +# define kernel_tex_index(tex) (tex) +#else +# define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK) +# define kernel_tex_index(tex) (tex >> IMAGE_DATA_TYPE_SHIFT) +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TEXTURE_H__ */ diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index aa22f6a2c57..a5d1d7152d5 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -18,78 +18,75 @@ #define __UTIL_TYPES_H__ #ifndef __KERNEL_OPENCL__ - -#include <stdlib.h> - +# include <stdlib.h> #endif /* Bitness */ #if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -#define __KERNEL_64_BIT__ +# define __KERNEL_64_BIT__ #endif /* Qualifiers for kernel code shared by CPU and GPU */ #ifndef __KERNEL_GPU__ - -#define ccl_device static inline -#define ccl_device_noinline static -#define ccl_global -#define ccl_constant -#define ccl_local -#define ccl_local_param -#define ccl_private -#define ccl_restrict __restrict -#define __KERNEL_WITH_SSE_ALIGN__ - -#if defined(_WIN32) && !defined(FREE_WINDOWS) -#define ccl_device_inline static __forceinline -#define ccl_device_forceinline static __forceinline -#define ccl_align(...) __declspec(align(__VA_ARGS__)) -#ifdef __KERNEL_64_BIT__ -#define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -#else -#undef __KERNEL_WITH_SSE_ALIGN__ -#define ccl_try_align(...) 
/* not support for function arguments (error C2719) */ -#endif -#define ccl_may_alias -#define ccl_always_inline __forceinline -#define ccl_never_inline __declspec(noinline) -#define ccl_maybe_unused - -#else - -#define ccl_device_inline static inline __attribute__((always_inline)) -#define ccl_device_forceinline static inline __attribute__((always_inline)) -#define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -#ifndef FREE_WINDOWS64 -#define __forceinline inline __attribute__((always_inline)) -#endif -#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -#define ccl_may_alias __attribute__((__may_alias__)) -#define ccl_always_inline __attribute__((always_inline)) -#define ccl_never_inline __attribute__((noinline)) -#define ccl_maybe_unused __attribute__((used)) - -#endif - -#endif +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ /* Standard Integer Types */ #ifndef __KERNEL_GPU__ - /* int8_t, uint16_t, and friends */ -#ifndef _WIN32 -#include <stdint.h> -#endif - +# ifndef _WIN32 +# include <stdint.h> +# endif /* SIMD Types */ - -#include "util/util_optimization.h" - -#endif +# include "util/util_optimization.h" +#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_BEGIN @@ -102,24 +99,19 @@ CCL_NAMESPACE_BEGIN /* Shorter Unsigned Names */ #ifndef __KERNEL_OPENCL__ - typedef unsigned char uchar; typedef unsigned int uint; - +typedef unsigned short ushort; #endif /* Fixed Bits Types */ #ifdef __KERNEL_OPENCL__ - typedef ulong uint64_t; - #endif #ifndef __KERNEL_GPU__ - -#ifdef _WIN32 - +# ifdef _WIN32 typedef signed char int8_t; typedef unsigned char uint8_t; @@ -131,360 +123,26 @@ typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; - -#ifdef __KERNEL_64_BIT__ +# ifdef __KERNEL_64_BIT__ typedef int64_t ssize_t; -#else +# else typedef int32_t ssize_t; -#endif - -#endif +# endif +# endif /* _WIN32 */ /* Generic Memory Pointer */ typedef uint64_t device_ptr; +#endif /* __KERNEL_GPU__ */ -/* Vector Types */ - -struct uchar2 { - uchar x, y; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int 
i) { return *(&x + i); } -}; - -struct uchar3 { - uchar x, y, z; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct uchar4 { - uchar x, y, z, w; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct int2 { - int x, y; - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int3 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int3() {} - __forceinline int3(const __m128i& a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int3(const int3& a) { m128 = a.m128; } - int3& operator =(const int3& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int4 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int4() {} - __forceinline int4(const __m128i& a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int4(const int4& a) : m128(a.m128) {} - int4& operator=(const int4& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct uint2 { - uint x, y; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint3 { - uint x, y, z; - - __forceinline uint operator[](uint i) const { return 
*(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint4 { - uint x, y, z, w; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct float2 { - float x, y; - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float3 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float3() {} - __forceinline float3(const __m128& a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float3(const float3& a) : m128(a.m128) {} - __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; } -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float4 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float4() {} - __forceinline float4(const __m128& a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float4(const float4& a) : m128(a.m128) {} - __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; } - -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -template<typename T> -class vector3 -{ -public: - T x, y, z; - - ccl_always_inline vector3() {} - ccl_always_inline vector3(const T& a) - : x(a), y(a), z(a) {} - ccl_always_inline vector3(const T& x, const T& y, const T& z) - : x(x), y(y), z(z) {} -}; - -#endif - -#ifndef 
__KERNEL_GPU__ - -/* Vector Type Constructors - * - * OpenCL does not support C++ class, so we use these instead. */ - -ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) -{ - uchar2 a = {x, y}; - return a; -} - -ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) -{ - uchar3 a = {x, y, z}; - return a; -} - -ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) -{ - uchar4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline int2 make_int2(int x, int y) -{ - int2 a = {x, y}; - return a; -} - -ccl_device_inline int3 make_int3(int x, int y, int z) -{ -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set_epi32(0, z, y, x); -#else - int3 a = {x, y, z, 0}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(int x, int y, int z, int w) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_set_epi32(w, z, y, x); -#else - int4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline uint2 make_uint2(uint x, uint y) -{ - uint2 a = {x, y}; - return a; -} - -ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) -{ - uint3 a = {x, y, z}; - return a; -} - -ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) -{ - uint4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline float2 make_float2(float x, float y) -{ - float2 a = {x, y}; - return a; -} - -ccl_device_inline float3 make_float3(float x, float y, float z) -{ -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set_ps(0.0f, z, y, x); -#else - float3 a = {x, y, z, 0.0f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(float x, float y, float z, float w) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set_ps(w, z, y, x); -#else - float4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline int3 make_int3(int i) -{ -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set1_epi32(i); -#else - int3 a = {i, i, i, i}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(int i) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = 
_mm_set1_epi32(i); -#else - int4 a = {i, i, i, i}; -#endif - - return a; -} - -ccl_device_inline float3 make_float3(float f) -{ -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set1_ps(f); -#else - float3 a = {f, f, f, f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(float f) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set1_ps(f); -#else - float4 a = {f, f, f, f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(const int4& i) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_cvtepi32_ps(i.m128); -#else - float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(const float3& f) +ccl_device_inline size_t align_up(size_t offset, size_t alignment) { -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_cvtps_epi32(f.m128); -#else - int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; -#endif - - return a; + return (offset + alignment - 1) & ~(alignment - 1); } -#endif - -ccl_device_inline size_t align_up(size_t offset, size_t alignment) +ccl_device_inline size_t divide_up(size_t x, size_t y) { - return (offset + alignment - 1) & ~(alignment - 1); + return (x + y - 1) / y; } ccl_device_inline size_t round_up(size_t x, size_t multiple) @@ -509,6 +167,25 @@ enum InterpolationType { INTERPOLATION_NUM_TYPES, }; +/* Texture types + * Since we store the type in the lower bits of a flat index, + * the shift and bit mask constant below need to be kept in sync. + */ + +enum ImageDataType { + IMAGE_DATA_TYPE_FLOAT4 = 0, + IMAGE_DATA_TYPE_BYTE4 = 1, + IMAGE_DATA_TYPE_HALF4 = 2, + IMAGE_DATA_TYPE_FLOAT = 3, + IMAGE_DATA_TYPE_BYTE = 4, + IMAGE_DATA_TYPE_HALF = 5, + + IMAGE_DATA_NUM_TYPES +}; + +#define IMAGE_DATA_TYPE_SHIFT 3 +#define IMAGE_DATA_TYPE_MASK 0x7 + /* Extension types for textures. * * Defines how the image is extrapolated past its original bounds. @@ -554,7 +231,7 @@ template<typename T> static inline T decltype_helper(T x) { return x; } * ... 
the compiler optimizes away the temp var */ #ifdef __GNUC__ #define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ + TYPEOF(var) *__tmp; \ __tmp = (type *)NULL; \ (void)__tmp; \ } (void)0 @@ -576,5 +253,50 @@ template<typename T> static inline T decltype_helper(T x) { return x; } CCL_NAMESPACE_END +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +/* Vectorized types declaration. */ +#include "util/util_types_uchar2.h" +#include "util/util_types_uchar3.h" +#include "util/util_types_uchar4.h" + +#include "util/util_types_int2.h" +#include "util/util_types_int3.h" +#include "util/util_types_int4.h" + +#include "util/util_types_uint2.h" +#include "util/util_types_uint3.h" +#include "util/util_types_uint4.h" + +#include "util/util_types_float2.h" +#include "util/util_types_float3.h" +#include "util/util_types_float4.h" + +#include "util/util_types_vector3.h" + +/* Vectorized types implementation. */ +#include "util/util_types_uchar2_impl.h" +#include "util/util_types_uchar3_impl.h" +#include "util/util_types_uchar4_impl.h" + +#include "util/util_types_int2_impl.h" +#include "util/util_types_int3_impl.h" +#include "util/util_types_int4_impl.h" + +#include "util/util_types_uint2_impl.h" +#include "util/util_types_uint3_impl.h" +#include "util/util_types_uint4_impl.h" + +#include "util/util_types_float2_impl.h" +#include "util/util_types_float3_impl.h" +#include "util/util_types_float4_impl.h" + +#include "util/util_types_vector3_impl.h" + #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h new file mode 100644 index 00000000000..ec7a1f717a1 --- /dev/null +++ b/intern/cycles/util/util_types_float2.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT2_H__ +#define __UTIL_TYPES_FLOAT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct float2 { + float x, y; + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float2 make_float2(float x, float y); +ccl_device_inline void print_float2(const char *label, const float2& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h new file mode 100644 index 00000000000..782dda195eb --- /dev/null +++ b/intern/cycles/util/util_types_float2_impl.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__ +#define __UTIL_TYPES_FLOAT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline float float2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +__forceinline float& float2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline float2 make_float2(float x, float y) +{ + float2 a = {x, y}; + return a; +} + +ccl_device_inline void print_float2(const char *label, const float2& a) +{ + printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h new file mode 100644 index 00000000000..28146ad04f7 --- /dev/null +++ b/intern/cycles/util/util_types_float3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_H__ +#define __UTIL_TYPES_FLOAT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) float3 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float3(); + __forceinline float3(const float3& a); + __forceinline explicit float3(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float3& operator =(const float3& a); +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float3 make_float3(float f); +ccl_device_inline float3 make_float3(float x, float y, float z); +ccl_device_inline void print_float3(const char *label, const float3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h new file mode 100644 index 00000000000..45f61767d3f --- /dev/null +++ b/intern/cycles/util/util_types_float3_impl.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__ +#define __UTIL_TYPES_FLOAT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float3::float3() +{ +} + +__forceinline float3::float3(const float3& a) + : m128(a.m128) +{ +} + +__forceinline float3::float3(const __m128& a) + : m128(a) +{ +} + +__forceinline float3::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float3::operator __m128&(void) +{ + return m128; +} + +__forceinline float3& float3::operator =(const float3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline float& float3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline float3 make_float3(float f) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set1_ps(f)); +#else + float3 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float3 make_float3(float x, float y, float z) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set_ps(0.0f, z, y, x)); +#else + float3 a = {x, y, z, 0.0f}; +#endif + return a; +} + +ccl_device_inline void print_float3(const char *label, const float3& a) +{ + printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h new file mode 100644 index 00000000000..a7d9abe1b95 --- /dev/null +++ b/intern/cycles/util/util_types_float4.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_H__ +#define __UTIL_TYPES_FLOAT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int4; + +struct ccl_try_align(16) float4 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float4(); + __forceinline float4(const float4& a); + __forceinline explicit float4(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float4& operator =(const float4& a); + +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float4 make_float4(float f); +ccl_device_inline float4 make_float4(float x, float y, float z, float w); +ccl_device_inline float4 make_float4(const int4& i); +ccl_device_inline void print_float4(const char *label, const float4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h new file mode 100644 index 00000000000..ff3ec4d4ecf --- /dev/null +++ b/intern/cycles/util/util_types_float4_impl.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__ +#define __UTIL_TYPES_FLOAT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float4::float4() +{ +} + +__forceinline float4::float4(const float4& a) + : m128(a.m128) +{ +} + +__forceinline float4::float4(const __m128& a) + : m128(a) +{ +} + +__forceinline float4::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float4::operator __m128&(void) +{ + return m128; +} + +__forceinline float4& float4::operator =(const float4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline float& float4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline float4 make_float4(float f) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set1_ps(f)); +#else + float4 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(float x, float y, float z, float w) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set_ps(w, z, y, x)); +#else + float4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(const int4& i) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_cvtepi32_ps(i.m128)); +#else + float4 a = {(float)i.x, (float)i.y, (float)i.z, 
(float)i.w}; +#endif + return a; +} + +ccl_device_inline void print_float4(const char *label, const float4& a) +{ + printf("%s: %.8f %.8f %.8f %.8f\n", + label, + (double)a.x, (double)a.y, (double)a.z, (double)a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int2.h b/intern/cycles/util/util_types_int2.h new file mode 100644 index 00000000000..82e860f89eb --- /dev/null +++ b/intern/cycles/util/util_types_int2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_H__ +#define __UTIL_TYPES_INT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int2 { + int x, y; + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int2 make_int2(int x, int y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_H__ */ diff --git a/intern/cycles/util/util_types_int2_impl.h b/intern/cycles/util/util_types_int2_impl.h new file mode 100644 index 00000000000..c7d3942e723 --- /dev/null +++ b/intern/cycles/util/util_types_int2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_IMPL_H__ +#define __UTIL_TYPES_INT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +int int2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +int& int2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline int2 make_int2(int x, int y) +{ + int2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h new file mode 100644 index 00000000000..9d43b201c02 --- /dev/null +++ b/intern/cycles/util/util_types_int3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_H__ +#define __UTIL_TYPES_INT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) int3 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int3(); + __forceinline int3(const int3& a); + __forceinline explicit int3(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int3& operator =(const int3& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int3 make_int3(int i); +ccl_device_inline int3 make_int3(int x, int y, int z); +ccl_device_inline void print_int3(const char *label, const int3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_H__ */ diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h new file mode 100644 index 00000000000..ada50c4812c --- /dev/null +++ b/intern/cycles/util/util_types_int3_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_IMPL_H__ +#define __UTIL_TYPES_INT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int3::int3() +{ +} + +__forceinline int3::int3(const __m128i& a) + : m128(a) +{ +} + +__forceinline int3::int3(const int3& a) + : m128(a.m128) +{ +} + +__forceinline int3::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int3::operator __m128i&(void) +{ + return m128; +} + +__forceinline int3& int3::operator =(const int3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline int& int3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline int3 make_int3(int i) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set1_epi32(i)); +#else + int3 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int3 make_int3(int x, int y, int z) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set_epi32(0, z, y, x)); +#else + int3 a = {x, y, z, 0}; +#endif + + return a; +} + +ccl_device_inline void print_int3(const char *label, const int3& a) +{ + printf("%s: %d %d %d\n", label, a.x, a.y, a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h new file mode 100644 index 00000000000..cdd0ecbdae5 --- /dev/null +++ b/intern/cycles/util/util_types_int4.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_H__ +#define __UTIL_TYPES_INT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ + +struct float3; + +struct ccl_try_align(16) int4 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int4(); + __forceinline int4(const int4& a); + __forceinline explicit int4(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int4& operator=(const int4& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int4 make_int4(int i); +ccl_device_inline int4 make_int4(int x, int y, int z, int w); +ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline void print_int4(const char *label, const int4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_H__ */ diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h new file mode 100644 index 00000000000..07cdc88f2dc --- /dev/null +++ b/intern/cycles/util/util_types_int4_impl.h @@ -0,0 +1,115 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_IMPL_H__ +#define __UTIL_TYPES_INT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int4::int4() +{ +} + +__forceinline int4::int4(const int4& a) + : m128(a.m128) +{ +} + +__forceinline int4::int4(const __m128i& a) + : m128(a) +{ +} + +__forceinline int4::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int4::operator __m128i&(void) +{ + return m128; +} + +__forceinline int4& int4::operator=(const int4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline int& int4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline int4 make_int4(int i) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set1_epi32(i)); +#else + int4 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(int x, int y, int z, int w) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set_epi32(w, z, y, x)); +#else + int4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(const float3& f) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_cvtps_epi32(f.m128)); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + return a; +} + +ccl_device_inline void print_int4(const char 
*label, const int4& a) +{ + printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar2.h b/intern/cycles/util/util_types_uchar2.h new file mode 100644 index 00000000000..f618a2234ca --- /dev/null +++ b/intern/cycles/util/util_types_uchar2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_H__ +#define __UTIL_TYPES_UCHAR2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar2 { + uchar x, y; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_H__ */ diff --git a/intern/cycles/util/util_types_uchar2_impl.h b/intern/cycles/util/util_types_uchar2_impl.h new file mode 100644 index 00000000000..d5f196d0ce0 --- /dev/null +++ b/intern/cycles/util/util_types_uchar2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_IMPL_H__ +#define __UTIL_TYPES_UCHAR2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +uchar& uchar2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) +{ + uchar2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar3.h b/intern/cycles/util/util_types_uchar3.h new file mode 100644 index 00000000000..1e3644e6fd6 --- /dev/null +++ b/intern/cycles/util/util_types_uchar3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_TYPES_UCHAR3_H__ +#define __UTIL_TYPES_UCHAR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar3 { + uchar x, y, z; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_H__ */ diff --git a/intern/cycles/util/util_types_uchar3_impl.h b/intern/cycles/util/util_types_uchar3_impl.h new file mode 100644 index 00000000000..611021efb7f --- /dev/null +++ b/intern/cycles/util/util_types_uchar3_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR3_IMPL_H__ +#define __UTIL_TYPES_UCHAR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +uchar& uchar3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) +{ + uchar3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar4.h b/intern/cycles/util/util_types_uchar4.h new file mode 100644 index 00000000000..3802cebbfb9 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_H__ +#define __UTIL_TYPES_UCHAR4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar4 { + uchar x, y, z, w; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_H__ */ diff --git a/intern/cycles/util/util_types_uchar4_impl.h b/intern/cycles/util/util_types_uchar4_impl.h new file mode 100644 index 00000000000..03039f60c54 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_IMPL_H__ +#define __UTIL_TYPES_UCHAR4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +uchar& uchar4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) +{ + uchar4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint2.h b/intern/cycles/util/util_types_uint2.h new file mode 100644 index 00000000000..c4a31899614 --- /dev/null +++ b/intern/cycles/util/util_types_uint2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_H__ +#define __UTIL_TYPES_UINT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint2 { + uint x, y; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint2 make_uint2(uint x, uint y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_H__ */ diff --git a/intern/cycles/util/util_types_uint2_impl.h b/intern/cycles/util/util_types_uint2_impl.h new file mode 100644 index 00000000000..b50ffa2667f --- /dev/null +++ b/intern/cycles/util/util_types_uint2_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_IMPL_H__ +#define __UTIL_TYPES_UINT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint2::operator[](uint i) const +{ + util_assert(i < 2); + return *(&x + i); +} + +__forceinline uint& uint2::operator[](uint i) +{ + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uint2 make_uint2(uint x, uint y) +{ + uint2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint3.h b/intern/cycles/util/util_types_uint3.h new file mode 100644 index 00000000000..aeeecd2df06 --- /dev/null +++ b/intern/cycles/util/util_types_uint3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_H__ +#define __UTIL_TYPES_UINT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint3 { + uint x, y, z; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_H__ */ diff --git a/intern/cycles/util/util_types_uint3_impl.h b/intern/cycles/util/util_types_uint3_impl.h new file mode 100644 index 00000000000..26005d5baff --- /dev/null +++ b/intern/cycles/util/util_types_uint3_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_IMPL_H__ +#define __UTIL_TYPES_UINT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint3::operator[](uint i) const +{ + util_assert(i < 3); + return *(&x + i); +} + +__forceinline uint& uint3::operator[](uint i) +{ + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) +{ + uint3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint4.h b/intern/cycles/util/util_types_uint4.h new file mode 100644 index 00000000000..2d3a7bb85e4 --- /dev/null +++ b/intern/cycles/util/util_types_uint4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_H__ +#define __UTIL_TYPES_UINT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint4 { + uint x, y, z, w; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_H__ */ diff --git a/intern/cycles/util/util_types_uint4_impl.h b/intern/cycles/util/util_types_uint4_impl.h new file mode 100644 index 00000000000..6d48131a446 --- /dev/null +++ b/intern/cycles/util/util_types_uint4_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_IMPL_H__ +#define __UTIL_TYPES_UINT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint4::operator[](uint i) const +{ + util_assert(i < 4); + return *(&x + i); +} + +__forceinline uint& uint4::operator[](uint i) +{ + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) +{ + uint4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_vector3.h b/intern/cycles/util/util_types_vector3.h new file mode 100644 index 00000000000..12acf9dc959 --- /dev/null +++ b/intern/cycles/util/util_types_vector3.h @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_H__ +#define __UTIL_TYPES_VECTOR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +class vector3 +{ +public: + T x, y, z; + + __forceinline vector3(); + __forceinline vector3(const T& a); + __forceinline vector3(const T& x, const T& y, const T& z); +}; +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_H__ */ diff --git a/intern/cycles/util/util_types_vector3_impl.h b/intern/cycles/util/util_types_vector3_impl.h new file mode 100644 index 00000000000..2f6b8368540 --- /dev/null +++ b/intern/cycles/util/util_types_vector3_impl.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__ +#define __UTIL_TYPES_VECTOR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +ccl_always_inline vector3<T>::vector3() +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& a) + : x(a), y(a), z(a) +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& x, const T& y, const T& z) + : x(x), y(y), z(z) +{ +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */ |