Cycles: Implement denoising option for reducing noise in the rendered image

This commit contains the first part of the new Cycles denoising option, which filters the resulting image using information gathered during rendering to get rid of noise while preserving visual features as well as possible. To use the option, enable it in the render layer options. The default settings fit a wide range of scenes, but the user can tweak individual settings to control the tradeoff between a noise-free image, image details, and calculation time. Note that the denoiser may still change in the future and that some features are not implemented yet. The most important missing feature is animation denoising, which uses information from multiple frames at once to produce a flicker-free and smoother result. These features will be added in the future. Finally, thanks to all the people who supported this project: - Google (through the GSoC) and Theory Studios for sponsoring the development - The authors of the papers I used for implementing the denoiser (more details on them will be included in the technical docs) - The other Cycles devs for feedback on the code, especially Sergey for mentoring the GSoC project and Brecht for the code review! - And of course the users who helped with testing, reported bugs and things that could and/or should work better!
author: Lukas Stockner <lukas.stockner@freenet.de> 2017-05-07 15:40:58 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2017-05-07 15:40:58 +0300
commit: 43b374e8c5430488a302298b1026faa1c3a231e9 (patch)
tree: 42e619a9fa08d02cef515b6315ce34dd7fd062b2 /intern/cycles
parent: bca697834728fd12c84941aa2a428abfe2090b27 (diff)
113 files changed, 6429 insertions, 1131 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index affeef994d4..b2694a285b1 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -238,3 +238,15 @@ def register_passes(engine, scene, srl):
     if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
     if crl.pass_debug_bvh_intersections:       engine.register_pass(scene, srl, "Debug BVH Intersections",       1, "X", 'VALUE')
     if crl.pass_debug_ray_bounces:             engine.register_pass(scene, srl, "Debug Ray Bounces",             1, "X", 'VALUE')
+
+    if crl.use_denoising and crl.denoising_store_passes:
+        engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR');
+        engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR');
+        engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR');
+        engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR');
+        engine.register_pass(scene, srl, "Denoising Depth",           1, "Z",   'VALUE');
+        engine.register_pass(scene, srl, "Denoising Depth Variance",  1, "Z",   'VALUE');
+        engine.register_pass(scene, srl, "Denoising Shadow A",        3, "XYV", 'VECTOR');
+        engine.register_pass(scene, srl, "Denoising Shadow B",        3, "XYV", 'VECTOR');
+        engine.register_pass(scene, srl, "Denoising Image",           3, "RGB", 'COLOR');
+        engine.register_pass(scene, srl, "Denoising Image Variance",  3, "RGB", 'COLOR');
+\ No newline at end of file
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index a8a0f0bfc70..2ac1a1aacdf 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1195,6 +1195,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
                 default=False,
                 )
 
+        cls.use_denoising = BoolProperty(
+                name="Use Denoising",
+                description="Denoise the rendered image",
+                default=False,
+                )
+        cls.denoising_diffuse_direct = BoolProperty(
+                name="Diffuse Direct",
+                description="Denoise the direct diffuse lighting",
+                default=True,
+                )
+        cls.denoising_diffuse_indirect = BoolProperty(
+                name="Diffuse Indirect",
+                description="Denoise the indirect diffuse lighting",
+                default=True,
+                )
+        cls.denoising_glossy_direct = BoolProperty(
+                name="Glossy Direct",
+                description="Denoise the direct glossy lighting",
+                default=True,
+                )
+        cls.denoising_glossy_indirect = BoolProperty(
+                name="Glossy Indirect",
+                description="Denoise the indirect glossy lighting",
+                default=True,
+                )
+        cls.denoising_transmission_direct = BoolProperty(
+                name="Transmission Direct",
+                description="Denoise the direct transmission lighting",
+                default=True,
+                )
+        cls.denoising_transmission_indirect = BoolProperty(
+                name="Transmission Indirect",
+                description="Denoise the indirect transmission lighting",
+                default=True,
+                )
+        cls.denoising_subsurface_direct = BoolProperty(
+                name="Subsurface Direct",
+                description="Denoise the direct subsurface lighting",
+                default=True,
+                )
+        cls.denoising_subsurface_indirect = BoolProperty(
+                name="Subsurface Indirect",
+                description="Denoise the indirect subsurface lighting",
+                default=True,
+                )
+        cls.denoising_strength = FloatProperty(
+                name="Denoising Strength",
+                description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
+                min=0.0, max=1.0,
+                default=0.5,
+                )
+        cls.denoising_feature_strength = FloatProperty(
+                name="Denoising Feature Strength",
+                description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
+                min=0.0, max=1.0,
+                default=0.5,
+                )
+        cls.denoising_radius = IntProperty(
+                name="Denoising Radius",
+                description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
+                min=1, max=50,
+                default=8,
+        )
+        cls.denoising_relative_pca = BoolProperty(
+                name="Relative filter",
+                description="When removing that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
+                default=False,
+        )
+        cls.denoising_store_passes = BoolProperty(
+                name="Store denoising passes",
+                description="Store the denoising feature passes and the noisy image",
+                default=False,
+        )
+
     @classmethod
     def unregister(cls):
         del bpy.types.SceneRenderLayer.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index ea5bc8979c0..4ed3ccd9a2c 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_emit", text="Emission")
         col.prop(rl, "use_pass_environment")
 
+        if context.scene.cycles.feature_set == 'EXPERIMENTAL':
+           col.separator()
+           sub = col.column()
+           sub.active = crl.use_denoising
+           sub.prop(crl, "denoising_store_passes", text="Denoising")
+
         if _cycles.with_cycles_debug:
           col = layout.column()
           col.prop(crl, "pass_debug_bvh_traversed_nodes")
@@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
             row.prop(rv, "camera_suffix", text="")
 
 
+class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
+    bl_label = "Denoising"
+    bl_context = "render_layer"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw_header(self, context):
+        rd = context.scene.render
+        rl = rd.layers.active
+        crl = rl.cycles
+        cscene = context.scene.cycles
+        layout = self.layout
+
+        layout.active = not cscene.use_progressive_refine
+        layout.prop(crl, "use_denoising", text="")
+
+    def draw(self, context):
+        layout = self.layout
+
+        scene = context.scene
+        cscene = scene.cycles
+        rd = scene.render
+        rl = rd.layers.active
+        crl = rl.cycles
+
+        layout.active = crl.use_denoising and not cscene.use_progressive_refine
+
+        split = layout.split()
+
+        col = split.column()
+        sub = col.column(align=True)
+        sub.prop(crl, "denoising_radius", text="Radius")
+        sub.prop(crl, "denoising_strength", slider=True, text="Strength")
+
+        col = split.column()
+        sub = col.column(align=True)
+        sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
+        sub.prop(crl, "denoising_relative_pca")
+
+        layout.separator()
+
+        row = layout.row()
+        row.label(text="Diffuse:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Glossy:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Transmission:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
+
+        row = layout.row()
+        row.label(text="Subsurface:")
+        sub = row.row(align=True)
+        sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
+        sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
+
+
 class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1734,6 +1805,7 @@ classes = (
     CyclesRender_PT_layer_options,
     CyclesRender_PT_layer_passes,
     CyclesRender_PT_views,
+    CyclesRender_PT_denoising,
     Cycles_PT_post_processing,
     CyclesCamera_PT_dof,
     Cycles_PT_context_material,
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index dfd6dc8db34..ec71800a9ea 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -299,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
 static void end_render_result(BL::RenderEngine& b_engine,
                               BL::RenderResult& b_rr,
                               bool cancel,
+                              bool highlight,
                               bool do_merge_results)
 {
-	b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
+	b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
 }
 
-void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
+void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
 {
 	BufferParams& params = rtile.buffers->params;
 	int x = params.full_x - session->tile_manager.params.full_x;
@@ -340,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
 			update_render_result(b_rr, b_rlay, rtile);
 		}
 
-		end_render_result(b_engine, b_rr, true, true);
+		end_render_result(b_engine, b_rr, true, highlight, true);
 	}
 	else {
 		/* write result */
 		write_render_result(b_rr, b_rlay, rtile);
-		end_render_result(b_engine, b_rr, false, true);
+		end_render_result(b_engine, b_rr, false, false, true);
 	}
 }
 
 void BlenderSession::write_render_tile(RenderTile& rtile)
 {
-	do_write_update_render_tile(rtile, false);
+	do_write_update_render_tile(rtile, false, false);
 }
 
-void BlenderSession::update_render_tile(RenderTile& rtile)
+void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
 {
 	/* use final write for preview renders, otherwise render result wouldn't be
 	 * be updated in blender side
 	 * would need to be investigated a bit further, but for now shall be fine
 	 */
 	if(!b_engine.is_preview())
-		do_write_update_render_tile(rtile, true);
+		do_write_update_render_tile(rtile, true, highlight);
 	else
-		do_write_update_render_tile(rtile, false);
+		do_write_update_render_tile(rtile, false, false);
 }
 
 void BlenderSession::render()
 {
 	/* set callback to write out render results */
 	session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
-	session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
+	session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
 
 	/* get buffer parameters */
 	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@@ -391,7 +392,7 @@ void BlenderSession::render()
 
 		/* layer will be missing if it was disabled in the UI */
 		if(b_single_rlay == b_rr.layers.end()) {
-			end_render_result(b_engine, b_rr, true, false);
+			end_render_result(b_engine, b_rr, true, true, false);
 			continue;
 		}
 
@@ -407,6 +408,29 @@ void BlenderSession::render()
 		}
 
 		buffer_params.passes = passes;
+
+		PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
+		bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
+		buffer_params.denoising_data_pass = use_denoising;
+		session->tile_manager.schedule_denoising = use_denoising;
+		session->params.use_denoising = use_denoising;
+		scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+		scene->film->denoising_flags = 0;
+		if(!get_boolean(crl, "denoising_diffuse_direct"))        scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
+		if(!get_boolean(crl, "denoising_diffuse_indirect"))      scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
+		if(!get_boolean(crl, "denoising_glossy_direct"))         scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
+		if(!get_boolean(crl, "denoising_glossy_indirect"))       scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
+		if(!get_boolean(crl, "denoising_transmission_direct"))   scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
+		if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
+		if(!get_boolean(crl, "denoising_subsurface_direct"))     scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
+		if(!get_boolean(crl, "denoising_subsurface_indirect"))   scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
+		scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+		buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
+		session->params.denoising_radius = get_int(crl, "denoising_radius");
+		session->params.denoising_strength = get_float(crl, "denoising_strength");
+		session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
+		session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
+
 		scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
 		scene->film->tag_passes_update(scene, passes);
 		scene->film->tag_update(scene);
@@ -460,7 +484,7 @@ void BlenderSession::render()
 		}
 
 		/* free result without merging */
-		end_render_result(b_engine, b_rr, true, false);
+		end_render_result(b_engine, b_rr, true, true, false);
 
 		if(session->progress.get_cancel())
 			break;
@@ -666,6 +690,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
 				/* copy pixels */
 				read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
 			}
+			else {
+				int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
+				if(denoising_offset >= 0) {
+					read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
+				}
+			}
 
 			if(!read) {
 				memset(&pixels[0], 0, pixels.size()*sizeof(float));
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 22b21a18f2e..536808c5b18 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -79,7 +79,7 @@ public:
 	void update_render_result(BL::RenderResult& b_rr,
 	                          BL::RenderLayer& b_rlay,
 	                          RenderTile& rtile);
-	void update_render_tile(RenderTile& rtile);
+	void update_render_tile(RenderTile& rtile, bool highlight);
 
 	/* interactive updates */
 	void synchronize();
@@ -147,7 +147,7 @@ protected:
 	                                   BL::RenderLayer& b_rlay,
 	                                   RenderTile& rtile,
 	                                   bool do_update_only);
-	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
+	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
 
 	int builtin_image_frame(const string &builtin_name);
 	void builtin_image_info(const string &builtin_name,
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index f362eade954..08ba535f282 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -525,6 +525,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
 	return PASS_NONE;
 }
 
+int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
+{
+	string name = b_pass.name();
+	if(name.substr(0, 10) != "Denoising ") {
+		return -1;
+	}
+	name = name.substr(10);
+
+#define MAP_PASS(passname, offset) if(name == passname) return offset;
+	MAP_PASS("Normal", DENOISING_PASS_NORMAL);
+	MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
+	MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
+	MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
+	MAP_PASS("Depth", DENOISING_PASS_DEPTH);
+	MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
+	MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
+	MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
+	MAP_PASS("Image", DENOISING_PASS_COLOR);
+	MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+#undef MAP_PASS
+
+	return -1;
+}
+
 array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
                                             BL::SceneRenderLayer& b_srlay)
 {
@@ -544,8 +568,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 			Pass::add(pass_type, passes);
 	}
 
-#ifdef __KERNEL_DEBUG__
 	PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
+	if(get_boolean(crp, "denoising_store_passes")) {
+		b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth",           1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth Variance",  1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow A",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow B",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Image",           3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Image Variance",  3, "RGB", b_srlay.name().c_str());
+	}
+#ifdef __KERNEL_DEBUG__
 	if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
 		b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
 		Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index aee39a5652a..0950285d976 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -96,6 +96,7 @@ public:
 	                                      int width, int height);
 
 	static PassType get_pass_type(BL::RenderPass& b_pass);
+	static int get_denoising_pass(BL::RenderPass& b_pass);
 
 private:
 	/* sync */
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 6ef2aa1caad..74ec57ddf74 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SRC
 	device.cpp
 	device_cpu.cpp
 	device_cuda.cpp
+	device_denoising.cpp
 	device_multi.cpp
 	device_opencl.cpp
 	device_split_kernel.cpp
@@ -48,6 +49,7 @@ endif()
 
 set(SRC_HEADERS
 	device.h
+	device_denoising.h
 	device_memory.h
 	device_intern.h
 	device_network.h
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index c024021b4b3..0603ecb3afb 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -402,4 +402,16 @@ void Device::free_memory()
 	devices.free_memory();
 }
 
+
+device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
+ : device(device)
+{
+	ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
+}
+
+device_sub_ptr::~device_sub_ptr()
+{
+	device->mem_free_sub_ptr(ptr);
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 6051dd8b3eb..527940e8f50 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -228,6 +228,7 @@ struct DeviceDrawParams {
 };
 
 class Device {
+	friend class device_sub_ptr;
 protected:
 	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
 
@@ -237,6 +238,14 @@ protected:
 	/* used for real time display */
 	unsigned int vertex_buffer;
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
+	{
+		/* Only required for devices that implement denoising. */
+		assert(false);
+		return (device_ptr) 0;
+	}
+	virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
+
 public:
 	virtual ~Device();
 
@@ -265,6 +274,8 @@ public:
 	virtual void mem_zero(device_memory& mem) = 0;
 	virtual void mem_free(device_memory& mem) = 0;
 
+	virtual int mem_address_alignment() { return 16; }
+
 	/* constant memory */
 	virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
 
@@ -312,6 +323,8 @@ public:
 	/* multi device */
 	virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
 	virtual int device_number(Device * /*sub_device*/) { return 0; }
+	virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
+	virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
 
 	/* static */
 	static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 84cce605182..1ecce8bd565 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -25,6 +25,7 @@
 #endif
 
 #include "device/device.h"
+#include "device/device_denoising.h"
 #include "device/device_intern.h"
 #include "device/device_split_kernel.h"
 
@@ -34,6 +35,8 @@
 #include "kernel/split/kernel_split_data.h"
 #include "kernel/kernel_globals.h"
 
+#include "kernel/filter/filter.h"
+
 #include "kernel/osl/osl_shader.h"
 #include "kernel/osl/osl_globals.h"
 
@@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN
 
 class CPUDevice;
 
-class CPUSplitKernel : public DeviceSplitKernel {
-	CPUDevice *device;
-public:
-	explicit CPUSplitKernel(CPUDevice *device);
-
-	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
-	                                            RenderTile& rtile,
-	                                            int num_global_elements,
-	                                            device_memory& kernel_globals,
-	                                            device_memory& kernel_data_,
-	                                            device_memory& split_data,
-	                                            device_memory& ray_state,
-	                                            device_memory& queue_index,
-	                                            device_memory& use_queues_flag,
-	                                            device_memory& work_pool_wgs);
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
 
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
-	virtual int2 split_kernel_local_size();
-	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
-	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
-class CPUDevice : public Device
-{
-	static unordered_map<string, void*> kernel_functions;
-
-	static void register_kernel_function(const char* name, void* func)
+template<typename F>
+class KernelFunctions {
+public:
+	KernelFunctions()
 	{
-		kernel_functions[name] = func;
+		kernel = (F)NULL;
 	}
 
-	static const char* get_arch_name()
+	KernelFunctions(F kernel_default,
+	                F kernel_sse2,
+	                F kernel_sse3,
+	                F kernel_sse41,
+	                F kernel_avx,
+	                F kernel_avx2)
 	{
+		const char *architecture_name = "default";
+		kernel = kernel_default;
+
+		/* Silence potential warnings about unused variables
+		 * when compiling without some architectures. */
+		(void)kernel_sse2;
+		(void)kernel_sse3;
+		(void)kernel_sse41;
+		(void)kernel_avx;
+		(void)kernel_avx2;
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2()) {
-			return "cpu_avx2";
+			architecture_name = "AVX2";
+			kernel = kernel_avx2;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 		if(system_cpu_support_avx()) {
-			return "cpu_avx";
+			architecture_name = "AVX";
+			kernel = kernel_avx;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 		if(system_cpu_support_sse41()) {
-			return "cpu_sse41";
+			architecture_name = "SSE4.1";
+			kernel = kernel_sse41;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
-			return "cpu_sse3";
+			architecture_name = "SSE3";
+			kernel = kernel_sse3;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 		if(system_cpu_support_sse2()) {
-			return "cpu_sse2";
+			architecture_name = "SSE2";
+			kernel = kernel_sse2;
 		}
-		else
 #endif
-		{
-			return "cpu";
+
+		if(strstr(architecture_name, logged_architecture) != 0) {
+			VLOG(1) << "Will be using " << architecture_name << " kernels.";
+			logged_architecture = architecture_name;
 		}
 	}
 
-	template<typename F>
-	static F get_kernel_function(string name)
-	{
-		name = string("kernel_") + get_arch_name() + "_" + name;
-
-		unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+	inline F operator()() const {
+		assert(kernel);
+		return kernel;
+	}
+protected:
+	F kernel;
+};
 
-		if(it == kernel_functions.end()) {
-			assert(!"kernel function not found");
-			return NULL;
-		}
+class CPUSplitKernel : public DeviceSplitKernel {
+	CPUDevice *device;
+public:
+	explicit CPUSplitKernel(CPUDevice *device);
 
-		return (F)it->second;
-	}
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
 
-	friend class CPUSplitKernel;
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
 
+class CPUDevice : public Device
+{
 public:
 	TaskPool task_pool;
 	KernelGlobals kernel_globals;
@@ -149,77 +168,89 @@ public:
 	bool use_split_kernel;
 
 	DeviceRequestedFeatures requested_features;
-	
+
+	KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)>   path_trace_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_half_float_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+
+	KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+	KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)>               filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_combine_halves_kernel;
+
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
+
+	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                                              filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                                       filter_finalize_kernel;
+
+	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+	                       ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
+	                       ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
+	unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+	      KERNEL_NAME_EVAL(cpu, name), \
+	      KERNEL_NAME_EVAL(cpu_sse2, name), \
+	      KERNEL_NAME_EVAL(cpu_sse3, name), \
+	      KERNEL_NAME_EVAL(cpu_sse41, name), \
+	      KERNEL_NAME_EVAL(cpu_avx, name), \
+	      KERNEL_NAME_EVAL(cpu_avx2, name)
+
 	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
-	: Device(info, stats, background)
+	: Device(info, stats, background),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+	  REGISTER_KERNEL(path_trace),
+	  REGISTER_KERNEL(convert_to_half_float),
+	  REGISTER_KERNEL(convert_to_byte),
+	  REGISTER_KERNEL(shader),
+	  REGISTER_KERNEL(filter_divide_shadow),
+	  REGISTER_KERNEL(filter_get_feature),
+	  REGISTER_KERNEL(filter_combine_halves),
+	  REGISTER_KERNEL(filter_nlm_calc_difference),
+	  REGISTER_KERNEL(filter_nlm_blur),
+	  REGISTER_KERNEL(filter_nlm_calc_weight),
+	  REGISTER_KERNEL(filter_nlm_update_output),
+	  REGISTER_KERNEL(filter_nlm_normalize),
+	  REGISTER_KERNEL(filter_construct_transform),
+	  REGISTER_KERNEL(filter_nlm_construct_gramian),
+	  REGISTER_KERNEL(filter_finalize),
+	  REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
 	{
 
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
 #endif
-
-		/* do now to avoid thread issues */
-		system_cpu_support_sse2();
-		system_cpu_support_sse3();
-		system_cpu_support_sse41();
-		system_cpu_support_avx();
-		system_cpu_support_avx2();
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			VLOG(1) << "Will be using AVX2 kernels.";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			VLOG(1) << "Will be using AVX kernels.";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			VLOG(1) << "Will be using SSE4.1 kernels.";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			VLOG(1) << "Will be using SSE3kernels.";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			VLOG(1) << "Will be using SSE2 kernels.";
-		}
-		else
-#endif
-		{
-			VLOG(1) << "Will be using regular kernels.";
-		}
-
 		use_split_kernel = DebugFlags().cpu.split_kernel;
 		if(use_split_kernel) {
 			VLOG(1) << "Will be using split kernel.";
 		}
 
-		kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+		REGISTER_SPLIT_KERNEL(path_init);
+		REGISTER_SPLIT_KERNEL(scene_intersect);
+		REGISTER_SPLIT_KERNEL(lamp_emission);
+		REGISTER_SPLIT_KERNEL(do_volume);
+		REGISTER_SPLIT_KERNEL(queue_enqueue);
+		REGISTER_SPLIT_KERNEL(indirect_background);
+		REGISTER_SPLIT_KERNEL(shader_setup);
+		REGISTER_SPLIT_KERNEL(shader_sort);
+		REGISTER_SPLIT_KERNEL(shader_eval);
+		REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		REGISTER_SPLIT_KERNEL(subsurface_scatter);
+		REGISTER_SPLIT_KERNEL(direct_lighting);
+		REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+		REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+		REGISTER_SPLIT_KERNEL(next_iteration_setup);
+		REGISTER_SPLIT_KERNEL(indirect_subsurface);
+		REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
 	}
 
 	~CPUDevice()
@@ -273,13 +304,17 @@ public:
 			if(!mem.data_pointer) {
 				free((void*)mem.device_pointer);
 			}
-
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		kernel_const_copy(&kernel_globals, name, host, size);
@@ -326,13 +361,8 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
-			if(!use_split_kernel) {
-				thread_path_trace(*task);
-			}
-			else {
-				thread_path_trace_split(*task);
-			}
+		if(task->type == DeviceTask::RENDER) {
+			thread_render(*task);
 		}
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
@@ -349,116 +379,319 @@ public:
 		}
 	};
 
-	void thread_path_trace(DeviceTask& task)
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
 	{
-		if(task_pool.canceled()) {
-			if(task.need_finish_queue == false)
-				return;
+		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
 		}
 
-		KernelGlobals kg = thread_kernel_globals_init();
-		RenderTile tile;
+		return true;
+	}
 
-		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{
+		int4 rect = task->rect;
+		int   r   = task->nlm_state.r;
+		int   f   = task->nlm_state.f;
+		float a   = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
+		int w = align_up(rect.z-rect.x, 4);
+		int h = rect.w-rect.y;
+
+		float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+		float *difference     = (float*) task->nlm_state.temporary_2_ptr;
+		float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
+
+		memset(weightAccum, 0, sizeof(float)*w*h);
+		memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+			filter_nlm_calc_difference_kernel()(dx, dy,
+			                                    (float*) guide_ptr,
+			                                    (float*) variance_ptr,
+			                                    difference,
+			                                    local_rect,
+			                                    w, 0,
+			                                    a, k_2);
+
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+			filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
+
+			filter_nlm_update_output_kernel()(dx, dy,
+			                                  blurDifference,
+			                                  (float*) image_ptr,
+			                                  (float*) out_ptr,
+			                                  weightAccum,
+			                                  local_rect,
+			                                  w, f);
+		}
+
+		int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+		filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			path_trace_kernel = kernel_cpu_avx2_path_trace;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			path_trace_kernel = kernel_cpu_avx_path_trace;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			path_trace_kernel = kernel_cpu_sse41_path_trace;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			path_trace_kernel = kernel_cpu_sse3_path_trace;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			path_trace_kernel = kernel_cpu_sse2_path_trace;
-		}
-		else
-#endif
-		{
-			path_trace_kernel = kernel_cpu_path_trace;
+		return true;
+	}
+
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		for(int y = 0; y < task->filter_area.w; y++) {
+			for(int x = 0; x < task->filter_area.z; x++) {
+				filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+				                                    x + task->filter_area.x,
+				                                    y + task->filter_area.y,
+				                                    y*task->filter_area.z + x,
+				                                    (float*) task->storage.transform.device_pointer,
+				                                    (int*)   task->storage.rank.device_pointer,
+				                                    &task->rect.x,
+				                                    task->buffer.pass_stride,
+				                                    task->radius,
+				                                    task->pca_threshold);
+			}
 		}
+		return true;
+	}
 
-		while(task.acquire_tile(this, tile)) {
-			float *render_buffer = (float*)tile.buffer;
-			uint *rng_state = (uint*)tile.rng_state;
-			int start_sample = tile.start_sample;
-			int end_sample = tile.start_sample + tile.num_samples;
-
-			for(int sample = start_sample; sample < end_sample; sample++) {
-				if(task.get_cancel() || task_pool.canceled()) {
-					if(task.need_finish_queue == false)
-						break;
-				}
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr guide_ptr,
+	                           device_ptr guide_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+
+		float *difference     = (float*) task->reconstruction_state.temporary_1_ptr;
+		float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+		int r = task->radius;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+			filter_nlm_calc_difference_kernel()(dx, dy,
+			                                    (float*) guide_ptr,
+			                                    (float*) guide_variance_ptr,
+			                                    difference,
+			                                    local_rect,
+			                                    task->buffer.w,
+			                                    task->buffer.pass_stride,
+			                                    1.0f,
+			                                    task->nlm_k_2);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+			filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
+			filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+			filter_nlm_construct_gramian_kernel()(dx, dy,
+			                                      blurDifference,
+			                                      (float*)  task->buffer.mem.device_pointer,
+			                                      (float*)  color_ptr,
+			                                      (float*)  color_variance_ptr,
+			                                      (float*)  task->storage.transform.device_pointer,
+			                                      (int*)    task->storage.rank.device_pointer,
+			                                      (float*)  task->storage.XtWX.device_pointer,
+			                                      (float3*) task->storage.XtWY.device_pointer,
+			                                      local_rect,
+			                                      &task->reconstruction_state.filter_rect.x,
+			                                      task->buffer.w,
+			                                      task->buffer.h,
+			                                      4,
+			                                      task->buffer.pass_stride);
+		}
+		for(int y = 0; y < task->filter_area.w; y++) {
+			for(int x = 0; x < task->filter_area.z; x++) {
+				filter_finalize_kernel()(x,
+				                         y,
+				                         y*task->filter_area.z + x,
+				                         task->buffer.w,
+				                         task->buffer.h,
+				                         (float*)  output_ptr,
+				                         (int*)    task->storage.rank.device_pointer,
+				                         (float*)  task->storage.XtWX.device_pointer,
+				                         (float3*) task->storage.XtWY.device_pointer,
+				                         &task->reconstruction_state.buffer_params.x,
+				                         task->render_buffer.samples);
+			}
+		}
+		return true;
+	}
 
-				for(int y = tile.y; y < tile.y + tile.h; y++) {
-					for(int x = tile.x; x < tile.x + tile.w; x++) {
-						path_trace_kernel(&kg, render_buffer, rng_state,
-						                  sample, x, y, tile.offset, tile.stride);
-					}
-				}
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
+	{
+		(void) task;
+		for(int y = rect.y; y < rect.w; y++) {
+			for(int x = rect.x; x < rect.z; x++) {
+				filter_combine_halves_kernel()(x, y,
+				                               (float*) mean_ptr,
+				                               (float*) variance_ptr,
+				                               (float*) a_ptr,
+				                               (float*) b_ptr,
+				                               &rect.x,
+				                               r);
+			}
+		}
+		return true;
+	}
 
-				tile.sample = sample + 1;
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		for(int y = task->rect.y; y < task->rect.w; y++) {
+			for(int x = task->rect.x; x < task->rect.z; x++) {
+				filter_divide_shadow_kernel()(task->render_buffer.samples,
+				                              task->tiles,
+				                              x, y,
+				                              (float*) a_ptr,
+				                              (float*) b_ptr,
+				                              (float*) sample_variance_ptr,
+				                              (float*) sv_variance_ptr,
+				                              (float*) buffer_variance_ptr,
+				                              &task->rect.x,
+				                              task->render_buffer.pass_stride,
+				                              task->render_buffer.denoising_data_offset,
+				                              use_split_kernel);
+			}
+		}
+		return true;
+	}
 
-				task.update_progress(&tile, tile.w*tile.h);
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{
+		for(int y = task->rect.y; y < task->rect.w; y++) {
+			for(int x = task->rect.x; x < task->rect.z; x++) {
+				filter_get_feature_kernel()(task->render_buffer.samples,
+				                            task->tiles,
+				                            mean_offset,
+				                            variance_offset,
+				                            x, y,
+				                            (float*) mean_ptr,
+				                            (float*) variance_ptr,
+				                            &task->rect.x,
+				                            task->render_buffer.pass_stride,
+				                            task->render_buffer.denoising_data_offset,
+				                            use_split_kernel);
 			}
+		}
+		return true;
+	}
 
-			task.release_tile(tile);
+	void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+	{
+		float *render_buffer = (float*)tile.buffer;
+		uint *rng_state = (uint*)tile.rng_state;
+		int start_sample = tile.start_sample;
+		int end_sample = tile.start_sample + tile.num_samples;
 
-			if(task_pool.canceled()) {
+		for(int sample = start_sample; sample < end_sample; sample++) {
+			if(task.get_cancel() || task_pool.canceled()) {
 				if(task.need_finish_queue == false)
 					break;
 			}
+
+			for(int y = tile.y; y < tile.y + tile.h; y++) {
+				for(int x = tile.x; x < tile.x + tile.w; x++) {
+					path_trace_kernel()(kg, render_buffer, rng_state,
+					                    sample, x, y, tile.offset, tile.stride);
+				}
+			}
+
+			tile.sample = sample + 1;
+
+			task.update_progress(&tile, tile.w*tile.h);
 		}
+	}
+
+	void denoise(DeviceTask &task, RenderTile &tile)
+	{
+		tile.sample = tile.start_sample + tile.num_samples;
+
+		DenoisingTask denoising(this);
 
-		thread_kernel_globals_free(&kg);
+		denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+		denoising.render_buffer.samples = tile.sample;
+
+		RenderTile rtiles[9];
+		rtiles[4] = tile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+
+		denoising.init_from_devicetask(task);
+
+		denoising.run_denoising();
+
+		task.unmap_neighbor_tiles(rtiles, this);
+
+		task.update_progress(&tile, tile.w*tile.h);
 	}
 
-	void thread_path_trace_split(DeviceTask& task)
+	void thread_render(DeviceTask& task)
 	{
 		if(task_pool.canceled()) {
 			if(task.need_finish_queue == false)
 				return;
 		}
 
-		RenderTile tile;
-
-		CPUSplitKernel split_kernel(this);
-
 		/* allocate buffer for kernel globals */
-		device_memory kgbuffer;
-		kgbuffer.resize(sizeof(KernelGlobals));
+		device_only_memory<KernelGlobals> kgbuffer;
+		kgbuffer.resize(1);
 		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
 
 		KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
 
-		requested_features.max_closure = MAX_CLOSURE;
-		if(!split_kernel.load_kernels(requested_features)) {
-			thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
-			mem_free(kgbuffer);
+		CPUSplitKernel *split_kernel = NULL;
+		if(use_split_kernel) {
+			split_kernel = new CPUSplitKernel(this);
+			requested_features.max_closure = MAX_CLOSURE;
+			if(!split_kernel->load_kernels(requested_features)) {
+				thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+				mem_free(kgbuffer);
 
-			return;
+				delete split_kernel;
+				return;
+			}
 		}
 
+		RenderTile tile;
 		while(task.acquire_tile(this, tile)) {
-			device_memory data;
-			split_kernel.path_trace(&task, tile, kgbuffer, data);
+			if(tile.task == RenderTile::PATH_TRACE) {
+				if(use_split_kernel) {
+					device_memory data;
+					split_kernel->path_trace(&task, tile, kgbuffer, data);
+				}
+				else {
+					path_trace(task, tile, kg);
+				}
+			}
+			else if(tile.task == RenderTile::DENOISE) {
+				denoise(task, tile);
+			}
 
 			task.release_tile(tile);
 
@@ -470,6 +703,7 @@ public:
 
 		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
 		mem_free(kgbuffer);
+		delete split_kernel;
 	}
 
 	void thread_film_convert(DeviceTask& task)
@@ -477,86 +711,16 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);
 
 		if(task.rgba_half) {
-			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
-			}
-			else
-#endif	
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
-			}
-			else
-#endif		
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
-			if(system_cpu_support_sse3()) {
-				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
-			}
-			else
-#endif
-			{
-				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
-			}
-
 			for(int y = task.y; y < task.y + task.h; y++)
 				for(int x = task.x; x < task.x + task.w; x++)
-					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
-						sample_scale, x, y, task.offset, task.stride);
+					convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+					                               sample_scale, x, y, task.offset, task.stride);
 		}
 		else {
-			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-			if(system_cpu_support_avx2()) {
-				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-			if(system_cpu_support_avx()) {
-				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
-			}
-			else
-#endif		
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-			if(system_cpu_support_sse41()) {
-				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
-			}
-			else
-#endif			
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-			if(system_cpu_support_sse3()) {
-				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
-			}
-			else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-			if(system_cpu_support_sse2()) {
-				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
-			}
-			else
-#endif
-			{
-				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
-			}
-
 			for(int y = task.y; y < task.y + task.h; y++)
 				for(int x = task.x; x < task.x + task.w; x++)
-					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
-						sample_scale, x, y, task.offset, task.stride);
+					convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+					                         sample_scale, x, y, task.offset, task.stride);
 
 		}
 	}
@@ -568,53 +732,17 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			shader_kernel = kernel_cpu_avx2_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			shader_kernel = kernel_cpu_avx_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
-		if(system_cpu_support_sse41()) {
-			shader_kernel = kernel_cpu_sse41_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			shader_kernel = kernel_cpu_sse3_shader;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			shader_kernel = kernel_cpu_sse2_shader;
-		}
-		else
-#endif
-		{
-			shader_kernel = kernel_cpu_shader;
-		}
-
 		for(int sample = 0; sample < task.num_samples; sample++) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-				shader_kernel(&kg,
-				              (uint4*)task.shader_input,
-				              (float4*)task.shader_output,
-				              (float*)task.shader_output_luma,
-				              task.shader_eval_type,
-				              task.shader_filter,
-				              x,
-				              task.offset,
-				              sample);
+				shader_kernel()(&kg,
+				                (uint4*)task.shader_input,
+				                (float4*)task.shader_output,
+				                (float*)task.shader_output_luma,
+				                task.shader_eval_type,
+				                task.shader_filter,
+				                x,
+				                task.offset,
+				                sample);
 
 			if(task.get_cancel() || task_pool.canceled())
 				break;
@@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                     device_memory& use_queues_flags,
                                                     device_memory& work_pool_wgs)
 {
-	typedef void(*data_init_t)(KernelGlobals *kg,
-	                           ccl_constant KernelData *data,
-	                           ccl_global void *split_data_buffer,
-	                           int num_elements,
-	                           ccl_global char *ray_state,
-	                           ccl_global uint *rng_state,
-	                           int start_sample,
-	                           int end_sample,
-	                           int sx, int sy, int sw, int sh, int offset, int stride,
-	                           ccl_global int *Queue_index,
-	                           int queuesize,
-	                           ccl_global char *use_queues_flag,
-	                           ccl_global unsigned int *work_pool_wgs,
-	                           unsigned int num_samples,
-	                           ccl_global float *buffer);
-
-	data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-	if(system_cpu_support_avx2()) {
-		data_init = kernel_cpu_avx2_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-	if(system_cpu_support_avx()) {
-		data_init = kernel_cpu_avx_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-	if(system_cpu_support_sse41()) {
-		data_init = kernel_cpu_sse41_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-	if(system_cpu_support_sse3()) {
-		data_init = kernel_cpu_sse3_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-	if(system_cpu_support_sse2()) {
-		data_init = kernel_cpu_sse2_data_init;
-	}
-	else
-#endif
-	{
-		data_init = kernel_cpu_data_init;
-	}
-
 	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
 	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
 
@@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
 		for(int x = 0; x < dim.global_size[0]; x++) {
 			kg->global_id = make_int2(x, y);
 
-			data_init((KernelGlobals*)kernel_globals.device_pointer,
-			          (KernelData*)data.device_pointer,
-			          (void*)split_data.device_pointer,
-			          num_global_elements,
-			          (char*)ray_state.device_pointer,
-			          (uint*)rtile.rng_state,
-			          rtile.start_sample,
-			          rtile.start_sample + rtile.num_samples,
-			          rtile.x,
-			          rtile.y,
-			          rtile.w,
-			          rtile.h,
-			          rtile.offset,
-			          rtile.stride,
-			          (int*)queue_index.device_pointer,
-			          dim.global_size[0] * dim.global_size[1],
-			          (char*)use_queues_flags.device_pointer,
-			          (uint*)work_pool_wgs.device_pointer,
-			          rtile.num_samples,
-			          (float*)rtile.buffer);
+			device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
+			                           (KernelData*)data.device_pointer,
+			                           (void*)split_data.device_pointer,
+			                           num_global_elements,
+			                           (char*)ray_state.device_pointer,
+			                           (uint*)rtile.rng_state,
+			                           rtile.start_sample,
+			                           rtile.start_sample + rtile.num_samples,
+			                           rtile.x,
+			                           rtile.y,
+			                           rtile.w,
+			                           rtile.h,
+			                           rtile.offset,
+			                           rtile.stride,
+			                           (int*)queue_index.device_pointer,
+			                           dim.global_size[0] * dim.global_size[1],
+			                           (char*)use_queues_flags.device_pointer,
+			                           (uint*)work_pool_wgs.device_pointer,
+			                           rtile.num_samples,
+			                           (float*)rtile.buffer);
 		}
 	}
 
@@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam
 {
 	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
 
-	kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+	kernel->func = device->split_kernels[kernel_name]();
 	if(!kernel->func) {
 		delete kernel;
 		return NULL;
@@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
 	return split_data_buffer_size(kg, num_threads);
 }
 
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index a971170318e..968ee5bc487 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
 #include <string.h>
 
 #include "device/device.h"
+#include "device/device_denoising.h"
 #include "device/device_intern.h"
 #include "device/device_split_kernel.h"
 
 #include "render/buffers.h"
 
+#include "kernel/filter/filter_defines.h"
+
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
@@ -113,7 +116,7 @@ public:
 	DedicatedTaskPool task_pool;
 	CUdevice cuDevice;
 	CUcontext cuContext;
-	CUmodule cuModule;
+	CUmodule cuModule, cuFilterModule;
 	map<device_ptr, bool> tex_interp_map;
 	map<device_ptr, uint> tex_bindless_map;
 	int cuDevId;
@@ -170,7 +173,7 @@ public:
 		CUresult result = stmt; \
 		\
 		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -301,7 +304,8 @@ public:
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features, bool split=false)
+	        const DeviceRequestedFeatures& requested_features,
+	        bool filter=false, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
@@ -316,7 +320,7 @@ public:
 		                              machine,
 		                              cuda_version,
 		                              include_path.c_str());
-		if(use_adaptive_compilation()) {
+		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
 		const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -364,8 +368,22 @@ public:
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features,
+	                      bool filter=false, bool split=false)
 	{
+		const char *name, *source;
+		if(filter) {
+			name = "filter";
+			source = "filter.cu";
+		}
+		else if(split) {
+			name = "kernel_split";
+			source = "kernel_split.cu";
+		}
+		else {
+			name = "kernel";
+			source = "kernel.cu";
+		}
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -373,9 +391,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
-			                                                  : "lib/kernel_sm_%d%d.cubin",
-			                                            major, minor));
+			const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+			                                            name, major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
 				VLOG(1) << "Using precompiled kernel.";
@@ -384,7 +401,7 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features, split);
+		        compile_kernel_get_common_cflags(requested_features, filter, split);
 
 		/* Try to use locally compiled kernel. */
 		const string source_path = path_get("source");
@@ -395,9 +412,8 @@ public:
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
-		                                              : "cycles_kernel_sm%d%d_%s.cubin",
-		                                        major, minor,
+		const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+		                                        name, major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -432,7 +448,7 @@ public:
 		const string kernel = path_join(
 		        path_join(source_path, "kernel"),
 		        path_join("kernels",
-		                  path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+		                  path_join("cuda", source)));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -480,11 +496,14 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features, use_split_kernel());
-
+		string cubin = compile_kernel(requested_features, false, use_split_kernel());
 		if(cubin == "")
 			return false;
 
+		string filter_cubin = compile_kernel(requested_features, true, false);
+		if(filter_cubin == "")
+			return false;
+
 		/* open module */
 		cuda_push_context();
 
@@ -499,6 +518,14 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
 
+		if(path_read_text(filter_cubin, cubin_data))
+			result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+		else
+			result = CUDA_ERROR_FILE_NOT_FOUND;
+
+		if(cuda_error_(result, "cuModuleLoad"))
+			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
 		cuda_pop_context();
 
 		return (result == CUDA_SUCCESS);
@@ -581,6 +608,11 @@ public:
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
 		CUdeviceptr mem;
@@ -881,6 +913,368 @@ public:
 		}
 	}
 
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+	{
+		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
+		}
+
+		mem_copy_to(task->tiles_mem);
+
+		return !have_error();
+	}
+
+#define CUDA_GET_BLOCKSIZE(func, w, h)                                                                          \
+			int threads_per_block;                                                                              \
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+			int threads = (int)sqrt((float)threads_per_block);                                                  \
+			int xblocks = ((w) + threads - 1)/threads;                                                          \
+			int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args)                      \
+			cuda_assert(cuLaunchKernel(func,                \
+			                           xblocks, yblocks, 1, \
+			                           threads, threads, 1, \
+			                           0, 0, args, 0));
+
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		int4 rect = task->rect;
+		int w = rect.z-rect.x;
+		int h = rect.w-rect.y;
+		int r = task->nlm_state.r;
+		int f = task->nlm_state.f;
+		float a = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
+		CUdeviceptr difference     = task->nlm_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+		CUdeviceptr weightAccum    = task->nlm_state.temporary_3_ptr;
+
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+		cuda_assert(cuModuleGetFunction(&cuNLMNormalize,      cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,      CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+		int dx, dy;
+		int4 local_rect;
+		int channel_offset = 0;
+		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+		void *blur_args[]            = {&difference, &blurDifference, &local_rect, &w, &f};
+		void *calc_weight_args[]     = {&blurDifference, &difference, &local_rect, &w, &f};
+		void *update_output_args[]   = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			dy = i / (2*r+1) - r;
+			dx = i % (2*r+1) - r;
+			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+		}
+
+		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterConstructTransform;
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+		                   task->storage.w,
+		                   task->storage.h);
+
+		void *args[] = {&task->buffer.mem.device_pointer,
+		                &task->storage.transform.device_pointer,
+		                &task->storage.rank.device_pointer,
+		                &task->filter_area,
+		                &task->rect,
+		                &task->radius,
+		                &task->pca_threshold,
+		                &task->buffer.pass_stride};
+		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr guide_ptr,
+	                           device_ptr guide_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+
+		cuda_push_context();
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+		cuda_assert(cuModuleGetFunction(&cuFinalize,            cuFilterModule, "kernel_cuda_filter_finalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFinalize,            CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+		                   task->reconstruction_state.source_w,
+		                   task->reconstruction_state.source_h);
+
+		CUdeviceptr difference     = task->reconstruction_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+		int r = task->radius;
+		int f = 4;
+		float a = 1.0f;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+
+			void *calc_difference_args[] = {&dx, &dy,
+			                                &guide_ptr,
+			                                &guide_variance_ptr,
+			                                &difference,
+			                                &local_rect,
+			                                &task->buffer.w,
+			                                &task->buffer.pass_stride,
+			                                &a,
+			                                &task->nlm_k_2};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+			void *blur_args[] = {&difference,
+			                     &blurDifference,
+			                     &local_rect,
+			                     &task->buffer.w,
+			                     &f};
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *calc_weight_args[] = {&blurDifference,
+			                            &difference,
+			                            &local_rect,
+			                            &task->buffer.w,
+			                            &f};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+			/* Reuse previous arguments. */
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *construct_gramian_args[] = {&dx, &dy,
+			                                  &blurDifference,
+			                                  &task->buffer.mem.device_pointer,
+			                                  &color_ptr,
+			                                  &color_variance_ptr,
+			                                  &task->storage.transform.device_pointer,
+			                                  &task->storage.rank.device_pointer,
+			                                  &task->storage.XtWX.device_pointer,
+			                                  &task->storage.XtWY.device_pointer,
+			                                  &local_rect,
+			                                  &task->reconstruction_state.filter_rect,
+			                                  &task->buffer.w,
+			                                  &task->buffer.h,
+			                                  &f,
+		                                      &task->buffer.pass_stride};
+			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+		}
+
+		void *finalize_args[] = {&task->buffer.w,
+		                         &task->buffer.h,
+		                         &output_ptr,
+				                 &task->storage.rank.device_pointer,
+				                 &task->storage.XtWX.device_pointer,
+				                 &task->storage.XtWY.device_pointer,
+				                 &task->filter_area,
+				                 &task->reconstruction_state.buffer_params.x,
+				                 &task->render_buffer.samples};
+		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
+	{
+		(void) task;
+
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterCombineHalves;
+		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&mean_ptr,
+		                &variance_ptr,
+		                &a_ptr,
+		                &b_ptr,
+		                &rect,
+		                &r};
+		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		(void) task;
+
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterDivideShadow;
+		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		bool use_split_variance = use_split_kernel();
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &a_ptr,
+		                &b_ptr,
+		                &sample_variance_ptr,
+		                &sv_variance_ptr,
+		                &buffer_variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		cuda_push_context();
+
+		CUfunction cuFilterGetFeature;
+		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		bool use_split_variance = use_split_kernel();
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+				        &mean_offset,
+				        &variance_offset,
+		                &mean_ptr,
+		                &variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset,
+		                &use_split_variance};
+		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+		cuda_assert(cuCtxSynchronize());
+
+		cuda_pop_context();
+		return !have_error();
+	}
+
+	void denoise(RenderTile &rtile, const DeviceTask &task)
+	{
+		DenoisingTask denoising(this);
+
+		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+		denoising.render_buffer.samples = rtile.sample;
+
+		RenderTile rtiles[9];
+		rtiles[4] = rtile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+
+		denoising.init_from_devicetask(task);
+
+		denoising.run_denoising();
+
+		task.unmap_neighbor_tiles(rtiles, this);
+	}
+
 	void path_trace(RenderTile& rtile, int sample, bool branched)
 	{
 		if(have_error())
@@ -1305,7 +1699,7 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
+		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
 			bool branched = task->integrator_branched;
@@ -1313,30 +1707,8 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
-			if(!use_split_kernel()) {
-				/* keep rendering tiles until done */
-				while(task->acquire_tile(this, tile)) {
-					int start_sample = tile.start_sample;
-					int end_sample = tile.start_sample + tile.num_samples;
-
-					for(int sample = start_sample; sample < end_sample; sample++) {
-						if(task->get_cancel()) {
-							if(task->need_finish_queue == false)
-								break;
-						}
-
-						path_trace(tile, sample, branched);
-
-						tile.sample = sample + 1;
-
-						task->update_progress(&tile, tile.w*tile.h);
-					}
-
-					task->release_tile(tile);
-				}
-			}
-			else {
-				DeviceRequestedFeatures requested_features;
+			DeviceRequestedFeatures requested_features;
+			if(use_split_kernel()) {
 				if(!use_adaptive_compilation()) {
 					requested_features.max_closure = 64;
 				}
@@ -1345,18 +1717,47 @@ public:
 					split_kernel = new CUDASplitKernel(this);
 					split_kernel->load_kernels(requested_features);
 				}
+			}
+
+			/* keep rendering tiles until done */
+			while(task->acquire_tile(this, tile)) {
+				if(tile.task == RenderTile::PATH_TRACE) {
+					if(use_split_kernel()) {
+						device_memory void_buffer;
+						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+					}
+					else {
+						int start_sample = tile.start_sample;
+						int end_sample = tile.start_sample + tile.num_samples;
+
+						for(int sample = start_sample; sample < end_sample; sample++) {
+							if(task->get_cancel()) {
+								if(task->need_finish_queue == false)
+									break;
+							}
 
-				while(task->acquire_tile(this, tile)) {
-					device_memory void_buffer;
-					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+							path_trace(tile, sample, branched);
 
-					task->release_tile(tile);
+							tile.sample = sample + 1;
 
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
+							task->update_progress(&tile, tile.w*tile.h);
+						}
 					}
 				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+
+					denoise(tile, *task);
+
+					task->update_progress(&tile, tile.w*tile.h);
+				}
+
+				task->release_tile(tile);
+
+				if(task->get_cancel()) {
+					if(task->need_finish_queue == false)
+						break;
+				}
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
new file mode 100644
index 00000000000..39c8cf30105
--- /dev/null
+++ b/intern/cycles/device/device_denoising.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoising.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+void DenoisingTask::init_from_devicetask(const DeviceTask &task)
+{
+	radius = task.denoising_radius;
+	nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
+	if(task.denoising_relative_pca) {
+		pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
+	}
+	else {
+		pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
+	}
+
+	render_buffer.pass_stride = task.pass_stride;
+	render_buffer.denoising_data_offset  = task.pass_denoising_data;
+	render_buffer.denoising_clean_offset = task.pass_denoising_clean;
+
+	/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+	rect = make_int4(max(tiles->x[0], filter_area.x - radius),
+	                 max(tiles->y[0], filter_area.y - radius),
+	                 min(tiles->x[3], filter_area.x + filter_area.z + radius),
+	                 min(tiles->y[3], filter_area.y + filter_area.w + radius));
+}
+
+void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
+{
+	tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
+
+	device_ptr buffers[9];
+	for(int i = 0; i < 9; i++) {
+		buffers[i] = rtiles[i].buffer;
+		tiles->offsets[i] = rtiles[i].offset;
+		tiles->strides[i] = rtiles[i].stride;
+	}
+	tiles->x[0] = rtiles[3].x;
+	tiles->x[1] = rtiles[4].x;
+	tiles->x[2] = rtiles[5].x;
+	tiles->x[3] = rtiles[5].x + rtiles[5].w;
+	tiles->y[0] = rtiles[1].y;
+	tiles->y[1] = rtiles[4].y;
+	tiles->y[2] = rtiles[7].y;
+	tiles->y[3] = rtiles[7].y + rtiles[7].h;
+
+	render_buffer.offset = rtiles[4].offset;
+	render_buffer.stride = rtiles[4].stride;
+	render_buffer.ptr    = rtiles[4].buffer;
+
+	functions.set_tiles(buffers);
+}
+
+bool DenoisingTask::run_denoising()
+{
+	/* Allocate denoising buffer. */
+	buffer.passes = 14;
+	buffer.w = align_up(rect.z - rect.x, 4);
+	buffer.h = rect.w - rect.y;
+	buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+	buffer.mem.resize(buffer.pass_stride * buffer.passes);
+	device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
+
+	device_ptr null_ptr = (device_ptr) 0;
+
+	/* Prefilter shadow feature. */
+	{
+		device_sub_ptr unfiltered_a   (device, buffer.mem, 0,                    buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr unfiltered_b   (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr sample_var     (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr buffer_var     (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr filtered_var   (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+		nlm_state.temporary_1_ptr = *nlm_temporary_1;
+		nlm_state.temporary_2_ptr = *nlm_temporary_2;
+		nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+		/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+		functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+		/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+		nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+		functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+		/* Reuse memory, the previous data isn't needed anymore. */
+		device_ptr filtered_a = *buffer_var,
+		           filtered_b = *sample_var;
+		/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+		nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+		functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+		functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+		device_ptr residual_var = *sample_var_var;
+		/* Estimate the residual variance between the two filtered halves. */
+		functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+		device_ptr final_a = *unfiltered_a,
+		           final_b = *unfiltered_b;
+		/* Use the residual variance for a second filter pass. */
+		nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+		functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+		functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+		/* Combine the two double-filtered halves to a final shadow feature. */
+		device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+	}
+
+	/* Prefilter general features. */
+	{
+		device_sub_ptr unfiltered     (device, buffer.mem,  8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr variance       (device, buffer.mem,  9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+
+		nlm_state.temporary_1_ptr = *nlm_temporary_1;
+		nlm_state.temporary_2_ptr = *nlm_temporary_2;
+		nlm_state.temporary_3_ptr = *nlm_temporary_3;
+
+		int mean_from[]     = { 0, 1, 2, 6,  7,  8, 12 };
+		int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
+		int pass_to[]       = { 1, 2, 3, 0,  5,  6,  7 };
+		for(int pass = 0; pass < 7; pass++) {
+			device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+			/* Get the unfiltered pass and its variance from the RenderBuffers. */
+			functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+			/* Smooth the pass and store the result in the denoising buffers. */
+			nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+			functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+		}
+	}
+
+	/* Copy color passes. */
+	{
+		int mean_from[]     = {20, 21, 22};
+		int variance_from[] = {23, 24, 25};
+		int mean_to[]       = { 8,  9, 10};
+		int variance_to[]   = {11, 12, 13};
+		int num_color_passes = 3;
+		for(int pass = 0; pass < num_color_passes; pass++) {
+			device_sub_ptr color_pass    (device, buffer.mem,     mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+			device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
+			functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+		}
+	}
+
+	storage.w = filter_area.z;
+	storage.h = filter_area.w;
+	storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
+	storage.rank.resize(storage.w*storage.h);
+	device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
+	device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
+
+	functions.construct_transform();
+
+	device_only_memory<float> temporary_1;
+	device_only_memory<float> temporary_2;
+	temporary_1.resize(buffer.w*buffer.h);
+	temporary_2.resize(buffer.w*buffer.h);
+	device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
+	device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
+	reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+	reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
+
+	storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
+	storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
+	device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
+	device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
+
+	reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+	int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
+	reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
+	                                               render_buffer.stride,
+	                                               render_buffer.pass_stride,
+	                                               render_buffer.denoising_clean_offset);
+	reconstruction_state.source_w = rect.z-rect.x;
+	reconstruction_state.source_h = rect.w-rect.y;
+
+	{
+		device_sub_ptr color_ptr    (device, buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+		device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
+		functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
+	}
+
+	device->mem_free(storage.XtWX);
+	device->mem_free(storage.XtWY);
+	device->mem_free(storage.transform);
+	device->mem_free(storage.rank);
+	device->mem_free(temporary_1);
+	device->mem_free(temporary_2);
+	device->mem_free(buffer.mem);
+	device->mem_free(tiles_mem);
+	return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
new file mode 100644
index 00000000000..86d8eb64386
--- /dev/null
+++ b/intern/cycles/device/device_denoising.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_DENOISING_H__
+#define __DEVICE_DENOISING_H__
+
+#include "device/device.h"
+
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DenoisingTask {
+public:
+	/* Parameters of the denoising algorithm. */
+	int radius;
+	float nlm_k_2;
+	float pca_threshold;
+
+	/* Pointer and parameters of the RenderBuffers. */
+	struct RenderBuffers {
+		int denoising_data_offset;
+		int denoising_clean_offset;
+		int pass_stride;
+		int offset;
+		int stride;
+		device_ptr ptr;
+		int samples;
+	} render_buffer;
+
+	TilesInfo *tiles;
+	device_vector<int> tiles_mem;
+	void tiles_from_rendertiles(RenderTile *rtiles);
+
+	int4 rect;
+	int4 filter_area;
+
+	struct DeviceFunctions {
+		function<bool(device_ptr image_ptr,    /* Contains the values that are smoothed. */
+		              device_ptr guide_ptr,    /* Contains the values that are used to calculate weights. */
+		              device_ptr variance_ptr, /* Contains the variance of the guide image. */
+		              device_ptr out_ptr       /* The filtered output is written into this image. */
+		              )> non_local_means;
+		function<bool(device_ptr color_ptr,
+		              device_ptr color_variance_ptr,
+		              device_ptr guide_ptr,
+		              device_ptr guide_variance_ptr,
+		              device_ptr output_ptr
+		              )> reconstruct;
+		function<bool()> construct_transform;
+
+		function<bool(device_ptr a_ptr,
+		              device_ptr b_ptr,
+		              device_ptr mean_ptr,
+		              device_ptr variance_ptr,
+		              int r,
+		              int4 rect
+		              )> combine_halves;
+		function<bool(device_ptr a_ptr,
+		              device_ptr b_ptr,
+		              device_ptr sample_variance_ptr,
+		              device_ptr sv_variance_ptr,
+		              device_ptr buffer_variance_ptr
+		              )> divide_shadow;
+		function<bool(int mean_offset,
+		              int variance_offset,
+		              device_ptr mean_ptr,
+		              device_ptr variance_ptr
+		              )> get_feature;
+		function<bool(device_ptr*)> set_tiles;
+	} functions;
+
+	/* Stores state of the current Reconstruction operation,
+	 * which is accessed by the device in order to perform the operation. */
+	struct ReconstructionState {
+		device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
+		device_ptr temporary_2_ptr;
+
+		int4 filter_rect;
+		int4 buffer_params;
+
+		int source_w;
+		int source_h;
+	} reconstruction_state;
+
+	/* Stores state of the current NLM operation,
+	 * which is accessed by the device in order to perform the operation. */
+	struct NLMState {
+		device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */
+		device_ptr temporary_2_ptr;
+		device_ptr temporary_3_ptr;
+
+		int r;      /* Search radius of the filter. */
+		int f;      /* Patch size of the filter. */
+		float a;    /* Variance compensation factor in the MSE estimation. */
+		float k_2;  /* Squared value of the k parameter of the filter. */
+
+		void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+	} nlm_state;
+
+	struct Storage {
+		device_only_memory<float>  transform;
+		device_only_memory<int>    rank;
+		device_only_memory<float>  XtWX;
+		device_only_memory<float3> XtWY;
+		int w;
+		int h;
+	} storage;
+
+	DenoisingTask(Device *device) : device(device) {}
+
+	void init_from_devicetask(const DeviceTask &task);
+
+	bool run_denoising();
+
+	struct DenoiseBuffers {
+		int pass_stride;
+		int passes;
+		int w;
+		int h;
+		device_only_memory<float> mem;
+	} buffer;
+
+protected:
+	Device *device;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 4b10514a9d2..b63dd00068b 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -35,6 +35,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+class Device;
+
 enum MemoryType {
 	MEM_READ_ONLY,
 	MEM_WRITE_ONLY,
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
 
 template<> struct device_type_traits<float3> {
 	static const DataType data_type = TYPE_FLOAT;
-	static const int num_elements = 3;
+	static const int num_elements = 4;
 };
 
 template<> struct device_type_traits<float4> {
@@ -173,6 +175,9 @@ class device_memory
 {
 public:
 	size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
+	size_t memory_elements_size(int elements) {
+		return elements*data_elements*datatype_size(data_type);
+	}
 
 	/* data information */
 	DataType data_type;
@@ -213,6 +218,22 @@ protected:
 	device_memory& operator = (const device_memory&);
 };
 
+template<typename T>
+class device_only_memory : public device_memory
+{
+public:
+	device_only_memory()
+	{
+		data_type = device_type_traits<T>::data_type;
+		data_elements = max(device_type_traits<T>::num_elements, 1);
+	}
+
+	void resize(size_t num)
+	{
+		device_memory::resize(num*sizeof(T));
+	}
+};
+
 /* Device Vector */
 
 template<typename T> class device_vector : public device_memory
@@ -299,6 +320,27 @@ private:
 	array<T> data;
 };
 
+/* A device_sub_ptr is a pointer into another existing memory.
+ * Therefore, it is not allocated separately, but just created from the already allocated base memory.
+ * It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
+ * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
+class device_sub_ptr
+{
+public:
+	device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
+	~device_sub_ptr();
+	/* No copying. */
+	device_sub_ptr& operator = (const device_sub_ptr&);
+
+	device_ptr operator*() const
+	{
+		return ptr;
+	}
+protected:
+	Device *device;
+	device_ptr ptr;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 624260a81c8..bc505b676fc 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -299,6 +299,60 @@ public:
 		return -1;
 	}
 
+	void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+	{
+		for(int i = 0; i < 9; i++) {
+			if(!tiles[i].buffers) {
+				continue;
+			}
+			/* If the tile was rendered on another device, copy its memory to
+			 * to the current device now, for the duration of the denoising task.
+			 * Note that this temporarily modifies the RenderBuffers and calls
+			 * the device, so this function is not thread safe. */
+			if(tiles[i].buffers->device != sub_device) {
+				device_vector<float> &mem = tiles[i].buffers->buffer;
+
+				tiles[i].buffers->copy_from_device();
+				device_ptr original_ptr = mem.device_pointer;
+				mem.device_pointer = 0;
+				sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
+				sub_device->mem_copy_to(mem);
+				tiles[i].buffer = mem.device_pointer;
+				mem.device_pointer = original_ptr;
+			}
+		}
+	}
+
+	void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
+	{
+		for(int i = 0; i < 9; i++) {
+			if(!tiles[i].buffers) {
+				continue;
+			}
+			if(tiles[i].buffers->device != sub_device) {
+				device_vector<float> &mem = tiles[i].buffers->buffer;
+
+				device_ptr original_ptr = mem.device_pointer;
+				mem.device_pointer = tiles[i].buffer;
+
+				/* Copy denoised tile to the host. */
+				if(i == 4) {
+					tiles[i].buffers->copy_from_device(sub_device);
+				}
+
+				size_t mem_size = mem.device_size;
+				sub_device->mem_free(mem);
+				mem.device_pointer = original_ptr;
+				mem.device_size = mem_size;
+
+				/* Copy denoised tile to the original device. */
+				if(i == 4) {
+					tiles[i].buffers->device->mem_copy_to(mem);
+				}
+			}
+		}
+	}
+
 	int get_split_task_count(DeviceTask& task)
 	{
 		int total_tasks = 0;
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 9118793aad6..dddd19f179f 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 		unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
 
 		/* Allocate work_pool_wgs memory. */
-		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+		work_pool_wgs.resize(max_work_groups);
 		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
 
-		queue_index.resize(NUM_QUEUES * sizeof(int));
+		queue_index.resize(NUM_QUEUES);
 		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
 
-		use_queues_flag.resize(sizeof(char));
+		use_queues_flag.resize(1);
 		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
 
 		ray_state.resize(num_global_elements);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 58c2fdbb077..68c2ba974a5 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -80,16 +80,16 @@ private:
 	 */
 	device_memory split_data;
 	device_vector<uchar> ray_state;
-	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+	device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
 
 	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	device_memory use_queues_flag;
+	device_only_memory<char> use_queues_flag;
 
 	/* Approximate time it takes to complete one sample */
 	double avg_time_per_sample;
 
 	/* Work pool with respect to each work group. */
-	device_memory work_pool_wgs;
+	device_only_memory<unsigned int> work_pool_wgs;
 
 	/* clos_max value for which the kernels have been loaded currently. */
 	int current_max_closure;
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index ca303365627..3bc4c310283 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
 	if(type == SHADER) {
 		num = min(shader_w, num);
 	}
-	else if(type == PATH_TRACE) {
+	else if(type == RENDER) {
 	}
 	else {
 		num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 			tasks.push_back(task);
 		}
 	}
-	else if(type == PATH_TRACE) {
+	else if(type == RENDER) {
 		for(int i = 0; i < num; i++)
 			tasks.push_back(*this);
 	}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 
 void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
 {
-	if((type != PATH_TRACE) &&
+	if((type != RENDER) &&
 	   (type != SHADER))
 		return;
 
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index feee89fd6e4..44a1efff1f5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -34,7 +34,7 @@ class Tile;
 
 class DeviceTask : public Task {
 public:
-	typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
+	typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
 	Type type;
 
 	int x, y, w, h;
@@ -53,7 +53,7 @@ public:
 
 	int passes_size;
 
-	explicit DeviceTask(Type type = PATH_TRACE);
+	explicit DeviceTask(Type type = RENDER);
 
 	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -65,6 +65,16 @@ public:
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
 	function<bool(void)> get_cancel;
+	function<void(RenderTile*, Device*)> map_neighbor_tiles;
+	function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
+
+	int denoising_radius;
+	float denoising_strength;
+	float denoising_feature_strength;
+	bool denoising_relative_pca;
+	int pass_stride;
+	int pass_denoising_data;
+	int pass_denoising_clean;
 
 	bool need_finish_queue;
 	bool integrator_branched;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index d061973dcb7..a458ca6bf64 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -17,6 +17,7 @@
 #ifdef WITH_OPENCL
 
 #include "device/device.h"
+#include "device/device_denoising.h"
 
 #include "util/util_map.h"
 #include "util/util_param.h"
@@ -129,6 +130,8 @@ public:
 	                            cl_int* error = NULL);
 	static cl_device_type get_device_type(cl_device_id device_id);
 
+	static int mem_address_alignment(cl_device_id device_id);
+
 	/* Get somewhat more readable device name.
 	 * Main difference is AMD OpenCL here which only gives code name
 	 * for the regular device name. This will give more sane device
@@ -218,7 +221,7 @@ public:
 		cl_int err = stmt; \
 		\
 		if(err != CL_SUCCESS) { \
-			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+			string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -282,7 +285,7 @@ public:
 		map<ustring, cl_kernel> kernels;
 	};
 
-	OpenCLProgram base_program;
+	OpenCLProgram base_program, denoising_program;
 
 	typedef map<string, device_vector<uchar>*> ConstMemMap;
 	typedef map<string, device_ptr> MemMap;
@@ -320,6 +323,9 @@ public:
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
 	void mem_free(device_memory& mem);
+
+	int mem_address_alignment();
+
 	void const_copy_to(const char *name, void *host, size_t size);
 	void tex_alloc(const char *name,
 	               device_memory& mem,
@@ -328,12 +334,14 @@ public:
 	void tex_free(device_memory& mem);
 
 	size_t global_size_round_up(int group_size, int global_size);
-	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
 	void shader(DeviceTask& task);
 
+	void denoise(RenderTile& tile, const DeviceTask& task);
+
 	class OpenCLDeviceTask : public DeviceTask {
 	public:
 		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -367,9 +375,48 @@ public:
 
 	virtual void thread_run(DeviceTask * /*task*/) = 0;
 
+	virtual bool is_split_kernel() = 0;
+
 protected:
 	string kernel_build_options(const string *debug_src = NULL);
 
+	void mem_zero_kernel(device_ptr ptr, size_t size);
+
+	bool denoising_non_local_means(device_ptr image_ptr,
+	                               device_ptr guide_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr out_ptr,
+	                               DenoisingTask *task);
+	bool denoising_construct_transform(DenoisingTask *task);
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr guide_ptr,
+	                           device_ptr guide_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task);
+	bool denoising_combine_halves(device_ptr a_ptr,
+	                              device_ptr b_ptr,
+	                              device_ptr mean_ptr,
+	                              device_ptr variance_ptr,
+	                              int r, int4 rect,
+	                              DenoisingTask *task);
+	bool denoising_divide_shadow(device_ptr a_ptr,
+	                             device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr,
+	                             device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr,
+	                             DenoisingTask *task);
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task);
+	bool denoising_set_tiles(device_ptr *buffers,
+	                         DenoisingTask *task);
+
+	device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
+	void mem_free_sub_ptr(device_ptr ptr);
+
 	class ArgumentWrapper {
 	public:
 		ArgumentWrapper() : size(0), pointer(NULL)
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 22aeaddcde8..ae1a7b917c3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program.add_kernel(ustring("bake"));
 	base_program.add_kernel(ustring("zero_buffer"));
 
+	denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+	denoising_program.add_kernel(ustring("filter_divide_shadow"));
+	denoising_program.add_kernel(ustring("filter_get_feature"));
+	denoising_program.add_kernel(ustring("filter_combine_halves"));
+	denoising_program.add_kernel(ustring("filter_construct_transform"));
+	denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+	denoising_program.add_kernel(ustring("filter_nlm_blur"));
+	denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+	denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+	denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+	denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+	denoising_program.add_kernel(ustring("filter_finalize"));
+	denoising_program.add_kernel(ustring("filter_set_tiles"));
+
 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
+	programs.push_back(&denoising_program);
 	/* Call actual class to fill the vector with its programs. */
 	if(!load_kernels(requested_features, programs)) {
 		return false;
@@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 	                                  NULL, NULL));
 }
 
-void OpenCLDeviceBase::mem_zero(device_memory& mem)
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
 {
-	if(mem.device_pointer) {
-		if(base_program.is_loaded()) {
-			cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+	cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
 
-			size_t global_size[] = {1024, 1024};
-			size_t num_threads = global_size[0] * global_size[1];
+	size_t global_size[] = {1024, 1024};
+	size_t num_threads = global_size[0] * global_size[1];
 
-			cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
-			cl_ulong d_offset = 0;
-			cl_ulong d_size = 0;
+	cl_mem d_buffer = CL_MEM_PTR(mem);
+	cl_ulong d_offset = 0;
+	cl_ulong d_size = 0;
 
-			while(d_offset < mem.memory_size()) {
-				d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+	while(d_offset < size) {
+		d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
 
-				kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+		kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
 
-				ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
-				                               ckZeroBuffer,
-				                               2,
-				                               NULL,
-				                               global_size,
-				                               NULL,
-				                               0,
-				                               NULL,
-				                               NULL);
-				opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+		ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+		                               ckZeroBuffer,
+		                               2,
+		                               NULL,
+		                               global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+		opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
 
-				d_offset += d_size;
-			}
+		d_offset += d_size;
+	}
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+	if(mem.device_pointer) {
+		if(base_program.is_loaded()) {
+			mem_zero_kernel(mem.device_pointer, mem.memory_size());
 		}
 
 		if(mem.data_pointer) {
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
 	}
 }
 
+int OpenCLDeviceBase::mem_address_alignment()
+{
+	return OpenCLInfo::mem_address_alignment(cdDevice);
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
+{
+	cl_mem_flags mem_flag;
+	if(type == MEM_READ_ONLY)
+		mem_flag = CL_MEM_READ_ONLY;
+	else if(type == MEM_WRITE_ONLY)
+		mem_flag = CL_MEM_WRITE_ONLY;
+	else
+		mem_flag = CL_MEM_READ_WRITE;
+
+	cl_buffer_region info;
+	info.origin = mem.memory_elements_size(offset);
+	info.size = mem.memory_elements_size(size);
+
+	device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+	                                                    mem_flag,
+	                                                    CL_BUFFER_CREATE_TYPE_REGION,
+	                                                    &info,
+	                                                    &ciErr);
+	opencl_assert_err(ciErr, "clCreateSubBuffer");
+	return sub_buf;
+}
+
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+	if(device_pointer && device_pointer != null_mem) {
+		opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+	}
+}
+
 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 {
 	ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
 	return global_size + ((r == 0)? 0: group_size - r);
 }
 
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
 {
 	size_t workgroup_size, max_work_items[3];
 
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
 	clGetDeviceInfo(cdDevice,
 		CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
 
+	if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+		workgroup_size = max_workgroup_size;
+	}
+
 	/* Try to divide evenly over 2 dimensions. */
 	size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
 	size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
 	enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
 }
 
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+                                                 device_ptr guide_ptr,
+                                                 device_ptr variance_ptr,
+                                                 device_ptr out_ptr,
+                                                 DenoisingTask *task)
+{
+	int4 rect = task->rect;
+	int w = rect.z-rect.x;
+	int h = rect.w-rect.y;
+	int r = task->nlm_state.r;
+	int f = task->nlm_state.f;
+	float a = task->nlm_state.a;
+	float k_2 = task->nlm_state.k_2;
+
+	cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
+	cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
+	cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+	cl_mem image_mem = CL_MEM_PTR(image_ptr);
+	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+	cl_mem out_mem = CL_MEM_PTR(out_ptr);
+
+	mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+	mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+	cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+	cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
+	cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
+	cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
+	cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
+
+	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+		int dy = i / (2*r+1) - r;
+		int dx = i % (2*r+1) - r;
+		int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+		kernel_set_args(ckNLMCalcDifference, 0,
+		                dx, dy, guide_mem, variance_mem,
+		                difference, local_rect, w, 0, a, k_2);
+		kernel_set_args(ckNLMBlur, 0,
+		                difference, blurDifference, local_rect, w, f);
+		kernel_set_args(ckNLMCalcWeight, 0,
+		                blurDifference, difference, local_rect, w, f);
+		kernel_set_args(ckNLMUpdateOutput, 0,
+		                dx, dy, blurDifference, image_mem,
+		                out_mem, weightAccum, local_rect, w, f);
+
+		enqueue_kernel(ckNLMCalcDifference, w, h);
+		enqueue_kernel(ckNLMBlur,           w, h);
+		enqueue_kernel(ckNLMCalcWeight,     w, h);
+		enqueue_kernel(ckNLMBlur,           w, h);
+		enqueue_kernel(ckNLMUpdateOutput,   w, h);
+	}
+
+	int4 local_rect = make_int4(0, 0, w, h);
+	kernel_set_args(ckNLMNormalize, 0,
+	                out_mem, weightAccum, local_rect, w);
+	enqueue_kernel(ckNLMNormalize, w, h);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+
+	cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+	kernel_set_args(ckFilterConstructTransform, 0,
+	                buffer_mem,
+	                transform_mem,
+	                rank_mem,
+	                task->filter_area,
+	                task->rect,
+	                task->buffer.pass_stride,
+	                task->radius,
+	                task->pca_threshold);
+
+	enqueue_kernel(ckFilterConstructTransform,
+	               task->storage.w,
+	               task->storage.h,
+	               256);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+                                             device_ptr color_variance_ptr,
+                                             device_ptr guide_ptr,
+                                             device_ptr guide_variance_ptr,
+                                             device_ptr output_ptr,
+                                             DenoisingTask *task)
+{
+	mem_zero(task->storage.XtWX);
+	mem_zero(task->storage.XtWY);
+
+	cl_mem color_mem = CL_MEM_PTR(color_ptr);
+	cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+	cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
+	cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+	cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+	cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+	cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
+	cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
+	cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
+	cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+	cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
+
+	cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
+	cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
+
+	int r = task->radius;
+	int f = 4;
+	float a = 1.0f;
+	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+		int dy = i / (2*r+1) - r;
+		int dx = i % (2*r+1) - r;
+
+		int local_rect[4] = {max(0, -dx), max(0, -dy),
+		                     task->reconstruction_state.source_w - max(0, dx),
+		                     task->reconstruction_state.source_h - max(0, dy)};
+
+		kernel_set_args(ckNLMCalcDifference, 0,
+		                dx, dy,
+		                guide_mem,
+		                guide_variance_mem,
+		                difference,
+		                local_rect,
+		                task->buffer.w,
+		                task->buffer.pass_stride,
+		                a, task->nlm_k_2);
+		enqueue_kernel(ckNLMCalcDifference,
+		               task->reconstruction_state.source_w,
+		               task->reconstruction_state.source_h);
+
+		kernel_set_args(ckNLMBlur, 0,
+		                difference,
+		                blurDifference,
+		                local_rect,
+		                task->buffer.w,
+		                f);
+		enqueue_kernel(ckNLMBlur,
+		               task->reconstruction_state.source_w,
+		               task->reconstruction_state.source_h);
+
+		kernel_set_args(ckNLMCalcWeight, 0,
+		                blurDifference,
+		                difference,
+		                local_rect,
+		                task->buffer.w,
+		                f);
+		enqueue_kernel(ckNLMCalcWeight,
+		               task->reconstruction_state.source_w,
+		               task->reconstruction_state.source_h);
+
+		/* Reuse previous arguments. */
+		enqueue_kernel(ckNLMBlur,
+		               task->reconstruction_state.source_w,
+		               task->reconstruction_state.source_h);
+
+		kernel_set_args(ckNLMConstructGramian, 0,
+		                dx, dy,
+		                blurDifference,
+		                buffer_mem,
+		                color_mem,
+		                color_variance_mem,
+		                transform_mem,
+		                rank_mem,
+		                XtWX_mem,
+		                XtWY_mem,
+		                local_rect,
+		                task->reconstruction_state.filter_rect,
+		                task->buffer.w,
+		                task->buffer.h,
+		                f,
+	                    task->buffer.pass_stride);
+		enqueue_kernel(ckNLMConstructGramian,
+		               task->reconstruction_state.source_w,
+		               task->reconstruction_state.source_h,
+		               256);
+	}
+
+	kernel_set_args(ckFinalize, 0,
+	                task->buffer.w,
+	                task->buffer.h,
+	                output_mem,
+	                rank_mem,
+	                XtWX_mem,
+	                XtWY_mem,
+	                task->filter_area,
+	                task->reconstruction_state.buffer_params,
+	                task->render_buffer.samples);
+	enqueue_kernel(ckFinalize,
+	               task->reconstruction_state.source_w,
+	               task->reconstruction_state.source_h);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+                                                device_ptr b_ptr,
+                                                device_ptr mean_ptr,
+                                                device_ptr variance_ptr,
+                                                int r, int4 rect,
+                                                DenoisingTask *task)
+{
+	(void) task;
+
+	cl_mem a_mem = CL_MEM_PTR(a_ptr);
+	cl_mem b_mem = CL_MEM_PTR(b_ptr);
+	cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+	cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+	kernel_set_args(ckFilterCombineHalves, 0,
+	                mean_mem,
+	                variance_mem,
+	                a_mem,
+	                b_mem,
+	                rect,
+	                r);
+	enqueue_kernel(ckFilterCombineHalves,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+                                               device_ptr b_ptr,
+                                               device_ptr sample_variance_ptr,
+                                               device_ptr sv_variance_ptr,
+                                               device_ptr buffer_variance_ptr,
+                                               DenoisingTask *task)
+{
+	(void) task;
+
+	cl_mem a_mem = CL_MEM_PTR(a_ptr);
+	cl_mem b_mem = CL_MEM_PTR(b_ptr);
+	cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+	cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+	cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+	char split_kernel = is_split_kernel()? 1 : 0;
+	kernel_set_args(ckFilterDivideShadow, 0,
+	                task->render_buffer.samples,
+	                tiles_mem,
+	                a_mem,
+	                b_mem,
+	                sample_variance_mem,
+	                sv_variance_mem,
+	                buffer_variance_mem,
+	                task->rect,
+	                task->render_buffer.pass_stride,
+	                task->render_buffer.denoising_data_offset,
+	                split_kernel);
+	enqueue_kernel(ckFilterDivideShadow,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+                                             int variance_offset,
+                                             device_ptr mean_ptr,
+                                             device_ptr variance_ptr,
+                                             DenoisingTask *task)
+{
+	cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+	char split_kernel = is_split_kernel()? 1 : 0;
+	kernel_set_args(ckFilterGetFeature, 0,
+	                task->render_buffer.samples,
+	                tiles_mem,
+	                mean_offset,
+	                variance_offset,
+	                mean_mem,
+	                variance_mem,
+	                task->rect,
+	                task->render_buffer.pass_stride,
+	                task->render_buffer.denoising_data_offset,
+	                split_kernel);
+	enqueue_kernel(ckFilterGetFeature,
+	               task->rect.z-task->rect.x,
+	               task->rect.w-task->rect.y);
+
+	return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+                                           DenoisingTask *task)
+{
+	mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
+	mem_copy_to(task->tiles_mem);
+
+	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+	cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+
+	kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+	for(int i = 0; i < 9; i++) {
+		cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+		kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+	}
+
+	enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+	return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
+{
+	DenoisingTask denoising(this);
+
+	denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+	denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+	denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+	denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+	denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+	denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+	denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+
+	denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+	denoising.render_buffer.samples = rtile.sample;
+
+	RenderTile rtiles[9];
+	rtiles[4] = rtile;
+	task.map_neighbor_tiles(rtiles, this);
+	denoising.tiles_from_rendertiles(rtiles);
+
+	denoising.init_from_devicetask(task);
+
+	denoising.run_denoising();
+
+	task.unmap_neighbor_tiles(rtiles, this);
+}
+
 void OpenCLDeviceBase::shader(DeviceTask& task)
 {
 	/* cast arguments to cl types */
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index a2fd1d71156..06c15bcf401 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -108,41 +108,53 @@ public:
 		else if(task->type == DeviceTask::SHADER) {
 			shader(*task);
 		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
+		else if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 			/* Keep rendering tiles until done. */
 			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+				if(tile.task == RenderTile::PATH_TRACE) {
+					int start_sample = tile.start_sample;
+					int end_sample = tile.start_sample + tile.num_samples;
 
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
-					}
+					for(int sample = start_sample; sample < end_sample; sample++) {
+						if(task->get_cancel()) {
+							if(task->need_finish_queue == false)
+								break;
+						}
+
+						path_trace(tile, sample);
 
-					path_trace(tile, sample);
+						tile.sample = sample + 1;
 
-					tile.sample = sample + 1;
+						task->update_progress(&tile, tile.w*tile.h);
+					}
 
+					/* Complete kernel execution before release tile */
+					/* This helps in multi-device render;
+					 * The device that reaches the critical-section function
+					 * release_tile waits (stalling other devices from entering
+					 * release_tile) for all kernels to complete. If device1 (a
+					 * slow-render device) reaches release_tile first then it would
+					 * stall device2 (a fast-render device) from proceeding to render
+					 * next tile.
+					 */
+					clFinish(cqCommandQueue);
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+					denoise(tile, *task);
 					task->update_progress(&tile, tile.w*tile.h);
 				}
 
-				/* Complete kernel execution before release tile */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
 				task->release_tile(tile);
 			}
 		}
 	}
+
+	bool is_split_kernel()
+	{
+		return false;
+	}
 };
 
 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index d175aae137a..76dcbd6fc9a 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -104,7 +104,7 @@ public:
 		else if(task->type == DeviceTask::SHADER) {
 			shader(*task);
 		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
+		else if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
 			/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -127,21 +127,29 @@ public:
 
 			/* Keep rendering tiles until done. */
 			while(task->acquire_tile(this, tile)) {
-				split_kernel->path_trace(task,
-				                         tile,
-				                         kgbuffer,
-				                         *const_mem_map["__data"]);
-
-				/* Complete kernel execution before release tile. */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
+				if(tile.task == RenderTile::PATH_TRACE) {
+					assert(tile.task == RenderTile::PATH_TRACE);
+					split_kernel->path_trace(task,
+					                         tile,
+					                         kgbuffer,
+					                         *const_mem_map["__data"]);
+
+					/* Complete kernel execution before release tile. */
+					/* This helps in multi-device render;
+					 * The device that reaches the critical-section function
+					 * release_tile waits (stalling other devices from entering
+					 * release_tile) for all kernels to complete. If device1 (a
+					 * slow-render device) reaches release_tile first then it would
+					 * stall device2 (a fast-render device) from proceeding to render
+					 * next tile.
+					 */
+					clFinish(cqCommandQueue);
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
+					denoise(tile, *task);
+					task->update_progress(&tile, tile.w*tile.h);
+				}
 
 				task->release_tile(tile);
 			}
@@ -150,6 +158,11 @@ public:
 		}
 	}
 
+	bool is_split_kernel()
+	{
+		return true;
+	}
+
 protected:
 	/* ** Those guys are for workign around some compiler-specific bugs ** */
 
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 38003dd1e1e..642c1bfa11c 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
 	return get_device_name(device_id);
 }
 
+int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
+{
+	int base_align_bits;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+	                   sizeof(int),
+	                   &base_align_bits,
+	                   NULL) == CL_SUCCESS)
+	{
+		return base_align_bits/8;
+	}
+	return 1;
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9bb0455b9d5..bef869f34b4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,7 +10,23 @@ set(INC_SYS
 
 set(SRC
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_sse2.cpp
+	kernels/cpu/kernel_sse3.cpp
+	kernels/cpu/kernel_sse41.cpp
+	kernels/cpu/kernel_avx.cpp
+	kernels/cpu/kernel_avx2.cpp
 	kernels/cpu/kernel_split.cpp
+	kernels/cpu/kernel_split_sse2.cpp
+	kernels/cpu/kernel_split_sse3.cpp
+	kernels/cpu/kernel_split_sse41.cpp
+	kernels/cpu/kernel_split_avx.cpp
+	kernels/cpu/kernel_split_avx2.cpp
+	kernels/cpu/filter.cpp
+	kernels/cpu/filter_sse2.cpp
+	kernels/cpu/filter_sse3.cpp
+	kernels/cpu/filter_sse41.cpp
+	kernels/cpu/filter_avx.cpp
+	kernels/cpu/filter_avx2.cpp
 	kernels/opencl/kernel.cl
 	kernels/opencl/kernel_state_buffer_size.cl
 	kernels/opencl/kernel_split.cl
@@ -32,8 +48,10 @@ set(SRC
 	kernels/opencl/kernel_next_iteration_setup.cl
 	kernels/opencl/kernel_indirect_subsurface.cl
 	kernels/opencl/kernel_buffer_update.cl
+	kernels/opencl/filter.cl
 	kernels/cuda/kernel.cu
 	kernels/cuda/kernel_split.cu
+	kernels/cuda/filter.cu
 )
 
 set(SRC_BVH_HEADERS
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu.h
 	kernels/cpu/kernel_cpu_impl.h
 	kernels/cpu/kernel_cpu_image.h
+	kernels/cpu/filter_cpu.h
+	kernels/cpu/filter_cpu_impl.h
 )
 
 set(SRC_KERNELS_CUDA_HEADERS
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
 	geom/geom_volume.h
 )
 
+set(SRC_FILTER_HEADERS
+	filter/filter.h
+	filter/filter_defines.h
+	filter/filter_features.h
+	filter/filter_features_sse.h
+	filter/filter_kernel.h
+	filter/filter_nlm_cpu.h
+	filter/filter_nlm_gpu.h
+	filter/filter_prefilter.h
+	filter/filter_reconstruction.h
+	filter/filter_transform.h
+	filter/filter_transform_gpu.h
+	filter/filter_transform_sse.h
+)
+
 set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_math_int2.h
 	../util/util_math_int3.h
 	../util/util_math_int4.h
+	../util/util_math_matrix.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		${SRC_CLOSURE_HEADERS}
 		${SRC_UTIL_HEADERS}
 	)
+	set(cuda_filter_sources kernels/cuda/filter.cu
+		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
+		${SRC_FILTER_HEADERS}
+		${SRC_UTIL_HEADERS}
+	)
 	set(cuda_cubins)
 
-	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
-		if(${split})
-			set(cuda_extra_flags "-D__SPLIT__")
-			set(cuda_cubin kernel_split)
-		else()
-			set(cuda_extra_flags "")
-			set(cuda_cubin kernel)
-		endif()
-
+	macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
 		if(${experimental})
-			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
-			set(cuda_cubin ${cuda_cubin}_experimental)
+			set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+			set(name ${name}_experimental)
 		endif()
 
-		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+		set(cuda_cubin ${name}_${arch}.cubin)
 
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")
 
-		if(split)
-			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
-		else()
-			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
-		endif()
+		set(cuda_kernel_src "/kernels/cuda/${name}.cu")
 
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
 					${cuda_arch_flags}
 					${cuda_version_flags}
 					${cuda_math_flags}
-					${cuda_extra_flags}
+					${flags}
 					${cuda_debug_flags}
 					-I${CMAKE_CURRENT_SOURCE_DIR}/..
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
-			DEPENDS ${cuda_sources})
+			DEPENDS ${sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
 		list(APPEND cuda_cubins ${cuda_cubin})
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
 
 		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
 			# Compile split kernel
-			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+			CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
 		endif()
 	endforeach()
 
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
 
 set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 
 if(CXX_HAS_SSE)
-	list(APPEND SRC
-		kernels/cpu/kernel_sse2.cpp
-		kernels/cpu/kernel_sse3.cpp
-		kernels/cpu/kernel_sse41.cpp
-		kernels/cpu/kernel_split_sse2.cpp
-		kernels/cpu/kernel_split_sse3.cpp
-		kernels/cpu/kernel_split_sse41.cpp
-	)
-
 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx.cpp
-		kernels/cpu/kernel_split_avx.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx2.cpp
-		kernels/cpu/kernel_split_avx2.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel
@@ -432,6 +452,7 @@ add_library(cycles_kernel
 	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
+	${SRC_FILTER_HEADERS}
 	${SRC_SVM_HEADERS}
 	${SRC_GEOM_HEADERS}
 	${SRC_SPLIT_HEADERS}
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index a6bba8bf74d..a04c157dc40 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 #endif
 }
 
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for the denoising feature pass generation,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-line closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+	if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+		return true;
+	}
+
+	if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+		return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+	}
+
+	return false;
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
 
 	float sigma;
 	float invsigma2;
-	float3 N;
 } VelvetBsdf;
 
 ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct DiffuseBsdf {
 	SHADER_CLOSURE_BASE;
-	float3 N;
 } DiffuseBsdf;
 
 /* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct DiffuseRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float3 *colors;
 } DiffuseRampBsdf;
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 3fe7572e4ce..30cc8b90330 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
 	float alpha_x, alpha_y, ior;
 	MicrofacetExtra *extra;
 	float3 T;
-	float3 N;
 } MicrofacetBsdf;
 
 /* Beckmann and GGX microfacet importance sampling. */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 57f1e733ee7..30644946840 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
 ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
-	if(cosI > 0.9999f || cosI < 1e-6f) {
+	if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
 		const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
 		const float phi = M_2PI_F * randU.y;
 		return make_float2(r*cosf(phi), r*sinf(phi));
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct OrenNayarBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float roughness;
 	float a;
 	float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct PhongRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float exponent;
 	float3 *colors;
 } PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 8a116693bdb..215c32e1ffb 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
 	SHADER_CLOSURE_BASE;
 
 	float roughness;
-	float3 N;
 } PrincipledDiffuseBsdf;
 
 ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 58df4f7ddbb..f4476bfecd0 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct PrincipledSheenBsdf {
 	SHADER_CLOSURE_BASE;
-	float3 N;
 } PrincipledSheenBsdf;
 
 ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct ToonBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float size;
 	float smooth;
 } ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index f9236a6e52c..f733ea4c517 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
 	float texture_blur;
 	float albedo;
 	float roughness;
-	float3 N;
 } Bssrdf;
 
 /* Planar Truncated Gaussian
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name)  KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE      (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+	int offsets[9];
+	int strides[9];
+	int x[4];
+	int y[4];
+	/* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+	ccl_global float *buffers[9];
+#else
+	long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..f5a40d49997
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                             for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                 for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW     } \
+                                 pixel_buffer += buffer_w - (high.x - low.x); \
+                             }
+
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
+{
+	features[0] = pixel.x;
+	features[1] = pixel.y;
+	features[2] = ccl_get_feature(buffer, 0);
+	features[3] = ccl_get_feature(buffer, 1);
+	features[4] = ccl_get_feature(buffer, 2);
+	features[5] = ccl_get_feature(buffer, 3);
+	features[6] = ccl_get_feature(buffer, 4);
+	features[7] = ccl_get_feature(buffer, 5);
+	features[8] = ccl_get_feature(buffer, 6);
+	features[9] = ccl_get_feature(buffer, 7);
+	if(mean) {
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			features[i] -= mean[i];
+	}
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
+{
+	scales[0] = fabsf(pixel.x - mean[0]);
+	scales[1] = fabsf(pixel.y - mean[1]);
+	scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+	                                    ccl_get_feature(buffer, 2) - mean[4],
+	                                    ccl_get_feature(buffer, 3) - mean[5]));
+	scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+	scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+	                                    ccl_get_feature(buffer, 6) - mean[8],
+	                                    ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+	scale[0] = 1.0f/max(scale[0], 0.01f);
+	scale[1] = 1.0f/max(scale[1], 0.01f);
+	scale[2] = 1.0f/max(scale[2], 0.01f);
+	scale[6] = 1.0f/max(scale[4], 0.01f);
+	scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+	return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
+}
+
+ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+	return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+                                      int rank,
+                                      ccl_global float ccl_restrict_ptr transform,
+                                      int stride,
+                                      int row,
+                                      float feature)
+{
+	for(int i = 0; i < rank; i++) {
+		design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+	}
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+                                                       ccl_global float ccl_restrict_ptr p_buffer,
+                                                       int2 q_pixel,
+                                                       ccl_global float ccl_restrict_ptr q_buffer,
+                                                       int pass_stride,
+                                                       int rank,
+                                                       float *design_row,
+                                                       ccl_global float ccl_restrict_ptr transform,
+                                                       int stride)
+{
+	design_row[0] = 1.0f;
+	math_vector_zero(design_row+1, rank);
+	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+	design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+	design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+	design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+	design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+	design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+	design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+	design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+	design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..303c8f482e3
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                     __m128 y4 = _mm_set1_ps(pixel.y); \
+                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE     } \
+                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
+                                 }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+	features[0] = x;
+	features[1] = y;
+	features[2] = ccl_get_feature_sse(0);
+	features[3] = ccl_get_feature_sse(1);
+	features[4] = ccl_get_feature_sse(2);
+	features[5] = ccl_get_feature_sse(3);
+	features[6] = ccl_get_feature_sse(4);
+	features[7] = ccl_get_feature_sse(5);
+	features[8] = ccl_get_feature_sse(6);
+	features[9] = ccl_get_feature_sse(7);
+	if(mean) {
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			features[i] = _mm_sub_ps(features[i], mean[i]);
+	}
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+	scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+	scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+	scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
+
+	__m128 diff, scale;
+	diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+	scale = _mm_mul_ps(diff, diff);
+	diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+	diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+	scales[3] = _mm_mask_ps(scale, active_pixels);
+
+	scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+	diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+	scale = _mm_mul_ps(diff, diff);
+	diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+	diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+	scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+	scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+	scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+	scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+	scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+	scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+	scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+#  include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+#  include "kernel/filter/filter_transform_gpu.h"
+#else
+#  ifdef __KERNEL_SSE3__
+#    include "kernel/filter/filter_transform_sse.h"
+#  else
+#    include "kernel/filter/filter_transform.h"
+#  endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+#  include "kernel/filter/filter_nlm_cpu.h"
+#else
+#  include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..1a314b100be
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			float diff = 0.0f;
+			int numChannels = channel_offset? 3 : 1;
+			for(int c = 0; c < numChannels; c++) {
+				float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+				float pvar = varianceImage[c*channel_offset + y*w+x];
+				float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+			}
+			if(numChannels > 1) {
+				diff *= 1.0f/numChannels;
+			}
+			differenceImage[y*w+x] = diff;
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+#ifdef __KERNEL_SSE3__
+	int aligned_lowx = (rect.x & ~(3));
+	int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+	for(int y = rect.y; y < rect.w; y++) {
+		const int low = max(rect.y, y-f);
+		const int high = min(rect.w, y+f+1);
+		for(int x = rect.x; x < rect.z; x++) {
+			outImage[y*w+x] = 0.0f;
+		}
+		for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+			for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+				_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
+			}
+#else
+			for(int x = rect.x; x < rect.z; x++) {
+				outImage[y*w+x] += differenceImage[y1*w+x];
+			}
+#endif
+		}
+		for(int x = rect.x; x < rect.z; x++) {
+			outImage[y*w+x] *= 1.0f/(high - low);
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			outImage[y*w+x] = 0.0f;
+		}
+	}
+	for(int dx = -f; dx <= f; dx++) {
+		int pos_dx = max(0, dx);
+		int neg_dx = min(0, dx);
+		for(int y = rect.y; y < rect.w; y++) {
+			for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+				outImage[y*w+x] += differenceImage[y*w+dx+x];
+			}
+		}
+	}
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += differenceImage[y*w+x1];
+			}
+			float weight = sum * (1.0f/(high - low));
+			accumImage[y*w+x] += weight;
+			outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+                                                           float ccl_restrict_ptr differenceImage,
+                                                           float ccl_restrict_ptr buffer,
+                                                           float *color_pass,
+                                                           float *variance_pass,
+                                                           float *transform,
+                                                           int *rank,
+                                                           float *XtWX,
+                                                           float3 *XtWY,
+                                                           int4 rect,
+                                                           int4 filter_rect,
+                                                           int w, int h, int f,
+                                                           int pass_stride)
+{
+	/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+	for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+		int y = fy + filter_rect.y;
+		for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+			int x = fx + filter_rect.x;
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += differenceImage[y*w+x1];
+			}
+			float weight = sum * (1.0f/(high - low));
+
+			int storage_ofs = fy*filter_rect.z + fx;
+			float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+			float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+			float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+			int    *l_rank = rank + storage_ofs;
+
+			kernel_filter_construct_gramian(x, y, 1,
+			                                dx, dy, w, h,
+			                                pass_stride,
+			                                buffer,
+			                                color_pass, variance_pass,
+			                                l_transform, l_rank,
+			                                weight, l_XtWX, l_XtWY, 0);
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			outImage[y*w+x] /= accumImage[y*w+x];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..b5ba7cf51a5
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+                                                         int dx, int dy,
+                                                         ccl_global float ccl_restrict_ptr weightImage,
+                                                         ccl_global float ccl_restrict_ptr varianceImage,
+                                                         ccl_global float *differenceImage,
+                                                         int4 rect, int w,
+                                                         int channel_offset,
+                                                         float a, float k_2)
+{
+	float diff = 0.0f;
+	int numChannels = channel_offset? 3 : 1;
+	for(int c = 0; c < numChannels; c++) {
+		float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+		float pvar = varianceImage[c*channel_offset + y*w+x];
+		float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+		diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+	}
+	if(numChannels > 1) {
+		diff *= 1.0f/numChannels;
+	}
+	differenceImage[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+                                              ccl_global float ccl_restrict_ptr differenceImage,
+                                              ccl_global float *outImage,
+                                              int4 rect, int w, int f)
+{
+	float sum = 0.0f;
+	const int low = max(rect.y, y-f);
+	const int high = min(rect.w, y+f+1);
+	for(int y1 = low; y1 < high; y1++) {
+		sum += differenceImage[y1*w+x];
+	}
+	sum *= 1.0f/(high-low);
+	outImage[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+                                                     ccl_global float ccl_restrict_ptr differenceImage,
+                                                     ccl_global float *outImage,
+                                                     int4 rect, int w, int f)
+{
+	float sum = 0.0f;
+	const int low = max(rect.x, x-f);
+	const int high = min(rect.z, x+f+1);
+	for(int x1 = low; x1 < high; x1++) {
+		sum += differenceImage[y*w+x1];
+	}
+	sum *= 1.0f/(high-low);
+	outImage[y*w+x] = expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+                                                       int dx, int dy,
+                                                       ccl_global float ccl_restrict_ptr differenceImage,
+                                                       ccl_global float ccl_restrict_ptr image,
+                                                       ccl_global float *outImage,
+                                                       ccl_global float *accumImage,
+                                                       int4 rect, int w, int f)
+{
+	float sum = 0.0f;
+	const int low = max(rect.x, x-f);
+	const int high = min(rect.z, x+f+1);
+	for(int x1 = low; x1 < high; x1++) {
+		sum += differenceImage[y*w+x1];
+	}
+	sum *= 1.0f/(high-low);
+	if(outImage) {
+		accumImage[y*w+x] += sum;
+		outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+	}
+	else {
+		accumImage[y*w+x] = sum;
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+                                                           int dx, int dy,
+                                                           ccl_global float ccl_restrict_ptr differenceImage,
+                                                           ccl_global float ccl_restrict_ptr buffer,
+                                                           ccl_global float *color_pass,
+                                                           ccl_global float *variance_pass,
+                                                           ccl_global float ccl_restrict_ptr transform,
+                                                           ccl_global int *rank,
+                                                           ccl_global float *XtWX,
+                                                           ccl_global float3 *XtWY,
+                                                           int4 rect,
+                                                           int4 filter_rect,
+                                                           int w, int h, int f,
+                                                           int pass_stride,
+                                                           int localIdx)
+{
+	int y = fy + filter_rect.y;
+	int x = fx + filter_rect.x;
+	const int low = max(rect.x, x-f);
+	const int high = min(rect.z, x+f+1);
+	float sum = 0.0f;
+	for(int x1 = low; x1 < high; x1++) {
+		sum += differenceImage[y*w+x1];
+	}
+	float weight = sum * (1.0f/(high - low));
+
+	int storage_ofs = fy*filter_rect.z + fx;
+	transform += storage_ofs;
+	rank += storage_ofs;
+	XtWX += storage_ofs;
+	XtWY += storage_ofs;
+
+	kernel_filter_construct_gramian(x, y,
+	                                filter_rect.z*filter_rect.w,
+	                                dx, dy, w, h,
+	                                pass_stride,
+	                                buffer,
+	                                color_pass, variance_pass,
+	                                transform, rank,
+	                                weight, XtWX, XtWY,
+	                                localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+                                                   ccl_global float *outImage,
+                                                   ccl_global float ccl_restrict_ptr accumImage,
+                                                   int4 rect, int w)
+{
+	outImage[y*w+x] /= accumImage[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..54bcf888052
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+                                            ccl_global TilesInfo *tiles,
+                                            int x, int y,
+                                            ccl_global float *unfilteredA,
+                                            ccl_global float *unfilteredB,
+                                            ccl_global float *sampleVariance,
+                                            ccl_global float *sampleVarianceV,
+                                            ccl_global float *bufferVariance,
+                                            int4 rect,
+                                            int buffer_pass_stride,
+                                            int buffer_denoising_offset,
+                                            bool use_split_variance)
+{
+	int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+	int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+	int tile = ytile*3+xtile;
+
+	int offset = tiles->offsets[tile];
+	int stride = tiles->strides[tile];
+	ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
+	center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+	center_buffer += buffer_denoising_offset + 14;
+
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+	unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+	unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+	float varA = center_buffer[2];
+	float varB = center_buffer[5];
+	int odd_sample = (sample+1)/2;
+	int even_sample = sample/2;
+	if(use_split_variance) {
+		varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+		varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+	}
+	varA /= (odd_sample - 1);
+	varB /= (even_sample - 1);
+
+	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
+	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+	bufferVariance[idx]  = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+                                          ccl_global TilesInfo *tiles,
+                                          int m_offset, int v_offset,
+                                          int x, int y,
+                                          ccl_global float *mean,
+                                          ccl_global float *variance,
+                                          int4 rect, int buffer_pass_stride,
+                                          int buffer_denoising_offset,
+                                          bool use_split_variance)
+{
+	int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+	int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+	int tile = ytile*3+xtile;
+	ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+	mean[idx] = center_buffer[m_offset] / sample;
+	if(use_split_variance) {
+		variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+	}
+	else {
+		variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+	}
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+                                             ccl_global float *mean,
+                                             ccl_global float *variance,
+                                             ccl_global float *a,
+                                             ccl_global float *b,
+                                             int4 rect, int r)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+	int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+	if(mean)     mean[idx] = 0.5f * (a[idx]+b[idx]);
+	if(variance) {
+		if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+		else {
+			variance[idx] = 0.0f;
+			float values[25];
+			int numValues = 0;
+			for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+				for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+					int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+					values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+				}
+			}
+			/* Insertion-sort the variances (fast enough for 25 elements). */
+			for(int i = 1; i < numValues; i++) {
+				float v = values[i];
+				int j;
+				for(j = i-1; j >= 0 && values[j] > v; j--)
+					values[j+1] = values[j];
+				values[j+1] = v;
+			}
+			variance[idx] = values[(7*numValues)/8];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..02f3802fa0c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+                                                       int storage_stride,
+                                                       int dx, int dy,
+                                                       int w, int h,
+                                                       int pass_stride,
+                                                       ccl_global float ccl_restrict_ptr buffer,
+                                                       ccl_global float *color_pass,
+                                                       ccl_global float *variance_pass,
+                                                       ccl_global float ccl_restrict_ptr transform,
+                                                       ccl_global int *rank,
+                                                       float weight,
+                                                       ccl_global float *XtWX,
+                                                       ccl_global float3 *XtWY,
+                                                       int localIdx)
+{
+	int p_offset =  y    *w +  x;
+	int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_CPU__
+	const int stride = 1;
+	(void)storage_stride;
+	(void)localIdx;
+	float design_row[DENOISE_FEATURES+1];
+#elif defined(__KERNEL_CUDA__)
+	const int stride = storage_stride;
+	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+	const int stride = storage_stride;
+	float design_row[DENOISE_FEATURES+1];
+#endif
+
+	float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
+	float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+
+	float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
+	float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
+
+	if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+		return;
+	}
+
+	filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
+	                                make_int2(x+dx, y+dy), buffer + q_offset,
+	                                pass_stride, *rank, design_row, transform, stride);
+
+	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+                                              ccl_global float *buffer,
+                                              ccl_global int *rank,
+                                              int storage_stride,
+                                              ccl_global float *XtWX,
+                                              ccl_global float3 *XtWY,
+                                              int4 buffer_params,
+                                              int sample)
+{
+#ifdef __KERNEL_CPU__
+	const int stride = 1;
+	(void)storage_stride;
+#else
+	const int stride = storage_stride;
+#endif
+
+	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+	float3 final_color = XtWY[0];
+
+	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+	final_color *= sample;
+	if(buffer_params.w) {
+		final_color.x += combined_buffer[buffer_params.w+0];
+		final_color.y += combined_buffer[buffer_params.w+1];
+		final_color.z += combined_buffer[buffer_params.w+2];
+	}
+	combined_buffer[0] = final_color.x;
+	combined_buffer[1] = final_color.y;
+	combined_buffer[2] = final_color.z;
+}
+
+#undef STORAGE_TYPE
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..139dc402d21
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+
+	float features[DENOISE_FEATURES];
+
+	/* Temporary storage, used in different steps of the algorithm. */
+	float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	float tempvector[2*DENOISE_FEATURES];
+	float ccl_restrict_ptr pixel_buffer;
+	int2 pixel;
+
+
+
+
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+
+
+
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float *feature_scale = tempvector;
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float* feature_matrix = tempmatrix;
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	*rank = 0;
+	if(pca_threshold < 0.0f) {
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold);
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else {
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	/* Bake the feature scaling into the transformation matrix. */
+	for(int i = 0; i < (*rank); i++) {
+		math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+	}
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..f7414aeed8a
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  ccl_global float *transform,
+                                                  ccl_global int *rank,
+                                                  int radius, float pca_threshold,
+                                                  int transform_stride, int localIdx)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+	float features[DENOISE_FEATURES];
+#endif
+
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	ccl_global float ccl_restrict_ptr pixel_buffer;
+	int2 pixel;
+
+
+
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float feature_scale[DENOISE_FEATURES];
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+	*rank = 0;
+	if(pca_threshold < 0.0f) {
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold);
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else {
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+	/* Bake the feature scaling into the transformation matrix. */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		for(int j = 0; j < (*rank); j++) {
+			transform[i*DENOISE_FEATURES + j] *= feature_scale[i];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..846f3ab3afa
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{
+	int buffer_w = align_up(rect.z - rect.x, 4);
+
+	__m128 features[DENOISE_FEATURES];
+	float ccl_restrict_ptr pixel_buffer;
+	int2 pixel;
+
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+
+	__m128 feature_means[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+	}
+
+	__m128 feature_scale[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	filter_calculate_scale_sse(feature_scale);
+
+	__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+	*rank = 0;
+	if(pca_threshold < 0.0f) {
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold);
+
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+			/* Bake the feature scaling into the transformation matrix. */
+			for(int j = 0; j < DENOISE_FEATURES; j++) {
+				transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
+			}
+		}
+	}
+	else {
+		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+			/* Bake the feature scaling into the transformation matrix. */
+			for(int j = 0; j < DENOISE_FEATURES; j++) {
+				transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
+			}
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+	/* Bake the feature scaling into the transformation matrix. */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 47778553b94..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 
 /* Interpolate smooth vertex normal from vertices */
 
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
 {
 	/* load triangle vertices */
 	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
 	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 
-	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+	float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+	return is_zero(N)? Ng: N;
 }
 
 /* Ray differentials on triangle */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 06c0fb2fbca..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
 #define KERNEL_ARCH cpu
 #include "kernel/kernels/cpu/kernel_cpu.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernel/kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernel/kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernel/kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  define KERNEL_ARCH cpu_avx
-#  include "kernel/kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernel/kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 823d30dde78..06728415c15 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
 	L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
 #endif
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_depth = 0.0f;
+#endif  /* __DENOISING_FEATURES__ */
 }
 
 ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 }
 
 ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+                                              ccl_addr_space PathState *state,
                                               float3 throughput,
                                               float3 alpha,
                                               float3 bsdf,
-                                              float3 ao,
-                                              int bounce)
+                                              float3 ao)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf*ao;
 			L->ao += alpha*throughput*ao;
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
 	}
 
 #ifdef __SHADOW_TRICKS__
-	float3 light = throughput * bsdf;
-	L->path_total += light;
-	L->path_total_shaded += ao * light;
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf;
+		L->path_total += light;
+		L->path_total_shaded += ao * light;
+	}
 #endif
 }
 
 ccl_device_inline void path_radiance_accum_total_ao(
         PathRadiance *L,
+        ccl_addr_space PathState *state,
         float3 throughput,
         float3 bsdf)
 {
 #ifdef __SHADOW_TRICKS__
-	L->path_total += throughput * bsdf;
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * bsdf;
+	}
 #else
 	(void) L;
+	(void) state;
 	(void) throughput;
 	(void) bsdf;
 #endif
 }
 
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+                                                 ccl_addr_space PathState *state,
+                                                 float3 throughput,
+                                                 BsdfEval *bsdf_eval,
+                                                 float3 shadow,
+                                                 float shadow_fac,
+                                                 bool is_lamp)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
 			L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 	}
 
 #ifdef __SHADOW_TRICKS__
-	float3 light = throughput * bsdf_eval->sum_no_mis;
-	L->path_total += light;
-	L->path_total_shaded += shadow * light;
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf_eval->sum_no_mis;
+		L->path_total += light;
+		L->path_total_shaded += shadow * light;
+	}
 #endif
 }
 
 ccl_device_inline void path_radiance_accum_total_light(
         PathRadiance *L,
+        ccl_addr_space PathState *state,
         float3 throughput,
         const BsdfEval *bsdf_eval)
 {
 #ifdef __SHADOW_TRICKS__
-	L->path_total += throughput * bsdf_eval->sum_no_mis;
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * bsdf_eval->sum_no_mis;
+	}
 #else
 	(void) L;
+	(void) state;
 	(void) throughput;
 	(void) bsdf_eval;
 #endif
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
 	}
 
 #ifdef __SHADOW_TRICKS__
-	L->path_total += throughput * value;
-	if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
-		L->path_total_shaded += throughput * value;
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * value;
+		if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+			L->path_total_shaded += throughput * value;
+		}
 	}
 #endif
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_albedo += state->denoising_feature_weight * value;
+#endif  /* __DENOISING_FEATURES__ */
 }
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 	return L_sum;
 }
 
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+	kernel_assert(L->use_light_pass);
+
+	*clean = L->emission + L->background;
+	*noisy = L->direct_scatter + L->indirect_scatter;
+
+#  define ADD_COMPONENT(flag, component)     \
+	if(kernel_data.film.denoising_flags & flag) \
+		*clean += component;                 \
+	else                                     \
+		*noisy += component;
+
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR,      L->direct_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND,      L->indirect_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR,       L->direct_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND,       L->indirect_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR,   L->direct_subsurface);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND,   L->indirect_subsurface);
+#  undef ADD_COMPONENT
+#else
+	*noisy = L->emission;
+	*clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+	*noisy = ensure_finite3(*noisy);
+	*clean = ensure_finite3(*clean);
+}
+
 ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
 {
 	float fac = 1.0f/num_samples;
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
 /* Calculate final light sum and transparency for shadow catcher object. */
 ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
                                                          const PathRadiance *L,
-                                                         ccl_addr_space float* L_transparent)
+                                                         float* alpha)
 {
 	const float shadow = path_radiance_sum_shadow(L);
 	float3 L_sum;
 	if(kernel_data.background.transparent) {
-		*L_transparent = shadow;
+		*alpha = 1.0f-shadow;
 		L_sum = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else {
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 21da180bb8e..7595e74e2d5 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -42,6 +42,8 @@
 #include "util/util_types.h"
 #include "util/util_texture.h"
 
+#define ccl_restrict_ptr const * __restrict
+
 #define ccl_addr_space
 
 #define ccl_local_id(d) 0
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index c375d17a95f..80d7401fbcf 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,6 +55,10 @@
 #define ccl_restrict __restrict__
 #define ccl_align(n) __align__(n)
 
+#define ccl_restrict_ptr const * __restrict__
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
 /* No assert supported for CUDA */
 
 #define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index c2263ac0d49..15cf4b81b21 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,6 +50,8 @@
 #  define ccl_addr_space
 #endif
 
+#define ccl_restrict_ptr const * __restrict__
+
 #define ccl_local_id(d) get_local_id(d)
 #define ccl_global_id(d) get_global_id(d)
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..9baa9d54957 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
 		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
 		cu = clamp(cu, -1.0f, 1.0f);
 		/* Compute xu. */
-		float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+		float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
 		xu = clamp(xu, x0, x1);
 		/* Compute yv. */
 		float z0sq = z0 * z0;
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index ed523696571..8ab4c724829 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 #endif  /* __SPLIT_KERNEL__ */
 }
 
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
+{
+	kernel_write_pass_float(buffer, sample, value);
+
+	/* The online one-pass variance update that's used for the megakernel can't easily be implemented
+	 * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+#  ifdef __SPLIT_KERNEL__
+	kernel_write_pass_float(buffer+1, sample, value*value);
+#  else
+	if(sample == 0) {
+		kernel_write_pass_float(buffer+1, sample, 0.0f);
+	}
+	else {
+		float new_mean = buffer[0] * (1.0f / (sample + 1));
+		float old_mean = (buffer[0] - value) * (1.0f / sample);
+		kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
+	}
+#  endif
+}
+
+#  if defined(__SPLIT_KERNEL__)
+#    define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+#  else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
+{
+	buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
+	buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
+	buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
+}
+#  endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
+{
+	kernel_write_pass_float3_unaligned(buffer, sample, value);
+#  ifdef __SPLIT_KERNEL__
+	kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
+#  else
+	if(sample == 0) {
+		kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
+	}
+	else {
+		float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
+		float3 new_mean = sum * (1.0f / (sample + 1));
+		float3 old_mean = (sum - value) * (1.0f / sample);
+		kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
+	}
+#  endif
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+	int sample, float path_total, float path_total_shaded)
+{
+	if(kernel_data.film.pass_denoising_data == 0)
+		return;
+
+	buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+	path_total = ensure_finite(path_total);
+	path_total_shaded = ensure_finite(path_total_shaded);
+
+	kernel_write_pass_float(buffer, sample/2, path_total);
+	kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
+
+	float value = path_total_shaded / max(path_total, 1e-7f);
+#  ifdef __SPLIT_KERNEL__
+	kernel_write_pass_float(buffer+2, sample/2, value*value);
+#  else
+	if(sample < 2) {
+		kernel_write_pass_float(buffer+2, sample/2, 0.0f);
+	}
+	else {
+		float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
+		float new_value = buffer[1] / max(buffer[0], 1e-7f);
+		kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
+	}
+#  endif
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        ccl_global PathState *state,
+                                                        PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight == 0.0f) {
+		return;
+	}
+
+	L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+	float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			continue;
+
+		/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+		normal += sc->N * sc->sample_weight;
+		sum_weight += sc->sample_weight;
+		if(!bsdf_is_specular_like(sc)) {
+			albedo += sc->weight;
+			sum_nonspecular_weight += sc->sample_weight;
+		}
+	}
+
+	/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+	if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+		if(sum_weight != 0.0f) {
+			normal /= sum_weight;
+		}
+		L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+		L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+		state->denoising_feature_weight = 0.0f;
+	}
+#else
+	(void) kg;
+	(void) sd;
+	(void) state;
+	(void) L;
+#endif  /* __DENOISING_FEATURES__ */
+}
+
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
 	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
 {
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
 #endif
 }
 
+ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
+	int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+{
+	if(L) {
+		float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+		if(is_shadow_catcher) {
+			L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
+		}
+		else
+#endif  /* __SHADOW_TRICKS__ */
+		{
+			L_sum = path_radiance_clamp_and_sum(kg, L);
+		}
+
+		kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+		kernel_write_light_passes(kg, buffer, L, sample);
+
+#ifdef __DENOISING_FEATURES__
+		if(kernel_data.film.pass_denoising_data) {
+#  ifdef __SHADOW_TRICKS__
+			kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
+#  else
+			kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+#  endif
+			if(kernel_data.film.pass_denoising_clean) {
+				float3 noisy, clean;
+				path_radiance_split_denoising(kg, L, &noisy, &clean);
+				kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+				                                  sample, noisy);
+				kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+				                                   sample, clean);
+			}
+			else {
+				kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+				                                  sample, L_sum);
+			}
+
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+			                                  sample, L->denoising_normal);
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+			                                  sample, L->denoising_albedo);
+			kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+			                                 sample, L->denoising_depth);
+		}
+#endif  /* __DENOISING_FEATURES__ */
+	}
+	else {
+		kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+
+#ifdef __DENOISING_FEATURES__
+		if(kernel_data.film.pass_denoising_data) {
+			kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+			                                  sample, make_float3(0.0f, 0.0f, 0.0f));
+
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+			                                  sample, make_float3(0.0f, 0.0f, 0.0f));
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+			                                  sample, make_float3(0.0f, 0.0f, 0.0f));
+			kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+			                                 sample, 0.0f);
+
+			if(kernel_data.film.pass_denoising_clean) {
+				kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+				                                   sample, make_float3(0.0f, 0.0f, 0.0f));
+			}
+		}
+#endif  /* __DENOISING_FEATURES__ */
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 58da141aed3..0d31ae32aa6 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
 		}
 		else {
-			path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+			path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
 		}
 	}
 }
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, sd, state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 }
 
 
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
-                                               RNG *rng,
-                                               int sample,
-                                               Ray ray,
-                                               ccl_global float *buffer)
+ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
+                                              RNG *rng,
+                                              int sample,
+                                              Ray ray,
+                                              ccl_global float *buffer,
+                                              PathRadiance *L,
+                                              bool *is_shadow_catcher)
 {
 	/* initialize */
-	PathRadiance L;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 	float L_transparent = 0.0f;
 
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
 
 	/* shader data memory used for both volumes and surfaces, saves stack space */
 	ShaderData sd;
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			float3 emission;
 
 			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
-				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
 		}
 #endif  /* __LAMP_MIS__ */
 
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 				/* emission */
 				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
 
 				/* scattering */
 				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &sd,
-						&emission_sd, throughput, &state, &L, all,
+						&emission_sd, throughput, &state, L, all,
 						&volume_ray, &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
 				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
 						continue;
 					else
 						break;
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			{
 				/* integrate along volume segment with distance sampling */
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+					kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
 
 					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
 						continue;
 					else
 						break;
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, &state, throughput, L_background);
+			path_radiance_accum_background(L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __SHADOW_TRICKS__
 		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
 			if(state.flag & PATH_RAY_CAMERA) {
-				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
 				state.catcher_object = sd.object;
 				if(!kernel_data.background.transparent) {
-					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+					L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
 				}
 			}
 		}
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+		kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
 
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		if(sd.flag & SD_EMISSION) {
 			/* todo: is isect.t wrong here for transparent surfaces? */
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+			path_radiance_accum_emission(L, throughput, emission, state.bounce);
 		}
 #endif  /* __EMISSION__ */
 
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, &sd, &state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+			kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
 		}
 #endif  /* __AO__ */
 
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(kernel_path_subsurface_scatter(kg,
 			                                  &sd,
 			                                  &emission_sd,
-			                                  &L,
+			                                  L,
 			                                  &state,
 			                                  rng,
 			                                  &ray,
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 
 		/* direct lighting */
-		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
 
 		/* compute direct lighting and next bounce */
-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
 			break;
 	}
 
 #ifdef __SUBSURFACE__
-		kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
+		kernel_path_subsurface_accum_indirect(&ss_indirect, L);
 
 		/* Trace indirect subsurface rays by restarting the loop. this uses less
 		 * stack memory than invoking kernel_path_indirect.
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			                                      &ss_indirect,
 			                                      &state,
 			                                      &ray,
-			                                      &L,
+			                                      L,
 			                                      &throughput);
 		}
 		else {
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 	}
 #endif  /* __SUBSURFACE__ */
 
-	float3 L_sum;
 #ifdef __SHADOW_TRICKS__
-	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
-		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
-	}
-	else
+	*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
 #endif  /* __SHADOW_TRICKS__ */
-	{
-		L_sum = path_radiance_clamp_and_sum(kg, &L);
-	}
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
 #endif  /* __KERNEL_DEBUG__ */
 
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+	return 1.0f - L_transparent;
 }
 
 ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
 
 	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	PathRadiance L;
+	bool is_shadow_catcher;
 
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	if(ray.t != 0.0f) {
+		float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+		kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+	}
+	else {
+		kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+	}
 
 	path_rng_end(kg, rng_state, rng);
 }
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ddcb57161ea..10816d3e5d1 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 			light_ray.dD = differential3_zero();
 
 			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+				path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
 			}
 			else {
-				path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+				path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
 			}
 		}
 	}
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
 	for(int i = 0; i < sd->num_closure; i++) {
 		const ShaderClosure *sc = &sd->closure[i];
 
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
 		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+		if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
 			continue;
+		}
 
 		int num_samples;
 
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			                                        &tp,
 			                                        &ps,
 			                                        L,
-			                                        &bsdf_ray))
+			                                        &bsdf_ray,
+			                                        sum_sample_weight))
 			{
 				continue;
 			}
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 }
 #endif  /* __SUBSURFACE__ */
 
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
+                                                RNG *rng,
+                                                int sample,
+                                                Ray ray,
+                                                ccl_global float *buffer,
+                                                PathRadiance *L,
+                                                bool *is_shadow_catcher)
 {
 	/* initialize */
-	PathRadiance L;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 	float L_transparent = 0.0f;
 
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
 
 	/* shader data memory used for both volumes and surfaces, saves stack space */
 	ShaderData sd;
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				int all = kernel_data.integrator.sample_all_lights_direct;
 
 				kernel_branched_path_volume_connect_light(kg, rng, &sd,
-					&emission_sd, throughput, &state, &L, all,
+					&emission_sd, throughput, &state, L, all,
 					&volume_ray, &volume_segment);
 
 				/* indirect light sampling */
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 					                             &sd,
 					                             &tp,
 					                             &ps,
-					                             &L,
+					                             L,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						                     tp*num_samples_inv,
 						                     num_samples,
 						                     &ps,
-						                     &L);
+						                     L);
 
 						/* for render passes, sum and reset indirect light pass variables
 						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
+						path_radiance_sum_indirect(L);
+						path_radiance_reset_indirect(L);
 					}
 				}
 			}
 
 			/* emission and transmittance */
 			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+				path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
 			throughput *= volume_segment.accum_transmittance;
 
 			/* free cached steps */
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				path_state_branch(&ps, j, num_samples);
 
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
+					kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
 
 #ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: support equiangular, MIS and all light sampling.
 					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
+					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
 
 					if(kernel_path_volume_bounce(kg,
 					                             rng,
 					                             &sd,
 					                             &tp,
 					                             &ps,
-					                             &L,
+					                             L,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						                     tp,
 						                     num_samples,
 						                     &ps,
-						                     &L);
+						                     L);
 
 						/* for render passes, sum and reset indirect light pass variables
 						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
+						path_radiance_sum_indirect(L);
+						path_radiance_reset_indirect(L);
 					}
 				}
 #endif  /* __VOLUME_SCATTER__ */
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, &state, throughput, L_background);
+			path_radiance_accum_background(L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __SHADOW_TRICKS__
 		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
 			if(state.flag & PATH_RAY_CAMERA) {
-				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
 				state.catcher_object = sd.object;
 				if(!kernel_data.background.transparent) {
-					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+					L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
 				}
 			}
 		}
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+		kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
 
 #ifdef __EMISSION__
 		/* emission */
 		if(sd.flag & SD_EMISSION) {
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+			path_radiance_accum_emission(L, throughput, emission, state.bounce);
 		}
 #endif  /* __EMISSION__ */
 
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 		}
 
+		kernel_update_denoising_features(kg, &sd, &state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
 		}
 #endif  /* __AO__ */
 
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
-			                                        &L, &state, rng, &ray, throughput);
+			                                        L, &state, rng, &ray, throughput);
 		}
 #endif  /* __SUBSURFACE__ */
 
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				int all = (kernel_data.integrator.sample_all_lights_direct) ||
 				          (state.flag & PATH_RAY_SHADOW_CATCHER);
 				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+					&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
 			}
 #endif  /* __EMISSION__ */
 
 			/* indirect light */
 			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
 
 			/* continue in case of transparency */
 			throughput *= shader_bsdf_transparency(kg, &sd);
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #endif  /* __VOLUME__ */
 	}
 
-	float3 L_sum;
 #ifdef __SHADOW_TRICKS__
-	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
-		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
-	}
-	else
+	*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
 #endif  /* __SHADOW_TRICKS__ */
-	{
-		L_sum = path_radiance_clamp_and_sum(kg, &L);
-	}
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
 #endif  /* __KERNEL_DEBUG__ */
 
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+	return 1.0f - L_transparent;
 }
 
 ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
 	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
 
 	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	PathRadiance L;
+	bool is_shadow_catcher;
 
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	if(ray.t != 0.0f) {
+		float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+		kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+	}
+	else {
+		kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+	}
 
 	path_rng_end(kg, rng_state, rng);
 }
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c0cd2a63120..0fa77d9e8bd 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 	state->transmission_bounce = 0;
 	state->transparent_bounce = 0;
 
+#ifdef __DENOISING_FEATURES__
+	if(kernel_data.film.pass_denoising_data) {
+		state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+		state->denoising_feature_weight = 1.0f;
+	}
+	else {
+		state->denoising_feature_weight = 0.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
 	state->min_ray_pdf = FLT_MAX;
 	state->ray_pdf = 0.0f;
 #ifdef __LAMP_MIS__
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 
 	/* random number generator next bounce */
 	state->rng_offset += PRNG_BOUNCE_NUM;
+
+	if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+		state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+	}
 }
 
 ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index bd4ba775b4d..e676ea0f3ae 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 
 						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 						else {
-							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 
 						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 						else {
-							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 
 				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
 				}
 				else {
-					path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+					path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
 				}
 			}
 		}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
         ccl_addr_space float3 *throughput,
         ccl_addr_space PathState *state,
         PathRadiance *L,
-        ccl_addr_space Ray *ray)
+        ccl_addr_space Ray *ray,
+        float sum_sample_weight)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
 	/* modify throughput */
 	path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
 
+#ifdef __DENOISING_FEATURES__
+	state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
+
 	/* modify path state */
 	path_state_next(kg, state, label);
 
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 
 			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
 			else {
-				path_radiance_accum_total_light(L, throughput, &L_light);
+				path_radiance_accum_total_light(L, state, throughput, &L_light);
 			}
 		}
 	}
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 371f2c1c7cb..dcedf51e479 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
 
 			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
 		}
 	}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
 
 						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
 
 						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
 
 				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
 				}
 			}
 		}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
 
 ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
 {
+	if(is_zero(dir))
+		return make_float2(0.0f, 0.0f);
+
 	float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
 	float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
 
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 8c0c5e90a3e..c66f52255f0 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		
 		/* smooth normal */
 		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
 		sd->N = Ng;
 
 		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #  ifdef __DPDU__
 		/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
 		if(sd->shader & SHADER_SMOOTH_NORMAL) {
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
 			if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 9b354457b91..dd1fa1b82f7 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
 #define __PATCH_EVAL__
 #define __SHADOW_TRICKS__
 
+#define __DENOISING_FEATURES__
+
 #ifdef __KERNEL_SHADING__
 #  define __SVM__
 #  define __EMISSION__
@@ -314,31 +316,32 @@ enum SamplingPattern {
 /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
 
 enum PathRayFlag {
-	PATH_RAY_CAMERA = 1,
-	PATH_RAY_REFLECT = 2,
-	PATH_RAY_TRANSMIT = 4,
-	PATH_RAY_DIFFUSE = 8,
-	PATH_RAY_GLOSSY = 16,
-	PATH_RAY_SINGULAR = 32,
-	PATH_RAY_TRANSPARENT = 64,
-
-	PATH_RAY_SHADOW_OPAQUE = 128,
-	PATH_RAY_SHADOW_TRANSPARENT = 256,
+	PATH_RAY_CAMERA              = (1 << 0),
+	PATH_RAY_REFLECT             = (1 << 1),
+	PATH_RAY_TRANSMIT            = (1 << 2),
+	PATH_RAY_DIFFUSE             = (1 << 3),
+	PATH_RAY_GLOSSY              = (1 << 4),
+	PATH_RAY_SINGULAR            = (1 << 5),
+	PATH_RAY_TRANSPARENT         = (1 << 6),
+
+	PATH_RAY_SHADOW_OPAQUE       = (1 << 7),
+	PATH_RAY_SHADOW_TRANSPARENT  = (1 << 8),
 	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
 
-	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
-	PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+	PATH_RAY_CURVE               = (1 << 9), /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER      = (1 << 10), /* volume scattering */
 
 	/* Special flag to tag unaligned BVH nodes. */
-	PATH_RAY_NODE_UNALIGNED = 2048,
+	PATH_RAY_NODE_UNALIGNED = (1 << 11),
 
-	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+	PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
 
-	PATH_RAY_MIS_SKIP = 4096,
-	PATH_RAY_DIFFUSE_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 16384,
-	PATH_RAY_SHADOW_CATCHER = 32768,
-	PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
+	PATH_RAY_MIS_SKIP            = (1 << 12),
+	PATH_RAY_DIFFUSE_ANCESTOR    = (1 << 13),
+	PATH_RAY_SINGLE_PASS_DONE    = (1 << 14),
+	PATH_RAY_SHADOW_CATCHER      = (1 << 15),
+	PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
+	PATH_RAY_STORE_SHADOW_INFO   = (1 << 17),
 };
 
 /* Closure Label */
@@ -394,6 +397,22 @@ typedef enum PassType {
 
 #define PASS_ALL (~0)
 
+typedef enum DenoisingPassOffsets {
+	DENOISING_PASS_NORMAL             = 0,
+	DENOISING_PASS_NORMAL_VAR         = 3,
+	DENOISING_PASS_ALBEDO             = 6,
+	DENOISING_PASS_ALBEDO_VAR         = 9,
+	DENOISING_PASS_DEPTH              = 12,
+	DENOISING_PASS_DEPTH_VAR          = 13,
+	DENOISING_PASS_SHADOW_A           = 14,
+	DENOISING_PASS_SHADOW_B           = 17,
+	DENOISING_PASS_COLOR              = 20,
+	DENOISING_PASS_COLOR_VAR          = 23,
+
+	DENOISING_PASS_SIZE_BASE          = 26,
+	DENOISING_PASS_SIZE_CLEAN         = 3,
+} DenoisingPassOffsets;
+
 typedef enum BakePassFilter {
 	BAKE_FILTER_NONE = 0,
 	BAKE_FILTER_DIRECT = (1 << 0),
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
 	BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
 } BakePassFilterCombos;
 
+typedef enum DenoiseFlag {
+	DENOISING_CLEAN_DIFFUSE_DIR      = (1 << 0),
+	DENOISING_CLEAN_DIFFUSE_IND      = (1 << 1),
+	DENOISING_CLEAN_GLOSSY_DIR       = (1 << 2),
+	DENOISING_CLEAN_GLOSSY_IND       = (1 << 3),
+	DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+	DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+	DENOISING_CLEAN_SUBSURFACE_DIR   = (1 << 6),
+	DENOISING_CLEAN_SUBSURFACE_IND   = (1 << 7),
+	DENOISING_CLEAN_ALL_PASSES       = (1 << 8)-1,
+} DenoiseFlag;
+
 typedef ccl_addr_space struct PathRadiance {
 #ifdef __PASSES__
 	int use_light_pass;
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
 	/* Color of the background on which shadow is alpha-overed. */
 	float3 shadow_color;
 #endif
+
+#ifdef __DENOISING_FEATURES__
+	float3 denoising_normal;
+	float3 denoising_albedo;
+	float denoising_depth;
+#endif  /* __DENOISING_FEATURES__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
 #define SHADER_CLOSURE_BASE \
 	float3 weight; \
 	ClosureType type; \
-	float sample_weight \
+	float sample_weight; \
+	float3 N
 
 typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
 	SHADER_CLOSURE_BASE;
 
-	float data[14]; /* pad to 80 bytes */
+	float data[10]; /* pad to 80 bytes */
 } ShaderClosure;
 
 /* Shader Context
@@ -960,6 +998,10 @@ typedef struct PathState {
 	int transmission_bounce;
 	int transparent_bounce;
 
+#ifdef __DENOISING_FEATURES__
+	float denoising_feature_weight;
+#endif  /* __DENOISING_FEATURES__ */
+
 	/* multiple importance sampling */
 	float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
 	float ray_pdf;     /* last bounce pdf */
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
 	float mist_inv_depth;
 	float mist_falloff;
 
+	int pass_denoising_data;
+	int pass_denoising_clean;
+	int denoising_flags;
+	int pad;
+
 #ifdef __KERNEL_DEBUG__
 	int pass_bvh_traversed_nodes;
 	int pass_bvh_traversed_instances;
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..10007ee2635
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleV,
+                                                     float *sampleVV,
+                                                     float *bufferV,
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset,
+                                                     bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean,
+                                                   float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset,
+                                                   bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* rect,
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weightImage,
+                                                           float *variance,
+                                                           float *differenceImage,
+                                                           int* rect,
+                                                           int w,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+                                                float *outImage,
+                                                int* rect,
+                                                int w,
+                                                int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+                                                       float *outImage,
+                                                       int* rect,
+                                                       int w,
+                                                       int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *differenceImage,
+                                                         float *image,
+                                                         float *outImage,
+                                                         float *accumImage,
+                                                         int* rect,
+                                                         int w,
+                                                         int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *differenceImage,
+                                                             float *buffer,
+                                                             float *color_pass,
+                                                             float *variance_pass,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_rect,
+                                                             int w,
+                                                             int h,
+                                                             int f,
+                                                             int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+                                                     float *accumImage,
+                                                     int* rect,
+                                                     int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                int w,
+                                                int h,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..3b71e50ca3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+#  include "util/util_debug.h"
+#  define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleVariance,
+                                                     float *sampleVarianceV,
+                                                     float *bufferVariance,
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset,
+                                                     bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+	kernel_filter_divide_shadow(sample, tiles,
+	                            x, y,
+	                            unfilteredA,
+	                            unfilteredB,
+	                            sampleVariance,
+	                            sampleVarianceV,
+	                            bufferVariance,
+	                            load_int4(prefilter_rect),
+	                            buffer_pass_stride,
+	                            buffer_denoising_offset,
+	                            use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean, float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset,
+                                                   bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+	kernel_filter_get_feature(sample, tiles,
+	                          m_offset, v_offset,
+	                          x, y,
+	                          mean, variance,
+	                          load_int4(prefilter_rect),
+	                          buffer_pass_stride,
+	                          buffer_denoising_offset,
+	                          use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+	kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* prefilter_rect,
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+  rank += storage_ofs;
+  transform += storage_ofs*TRANSFORM_SIZE;
+	kernel_filter_construct_transform(buffer,
+	                                  x, y,
+	                                  load_int4(prefilter_rect),
+	                                  pass_stride,
+	                                  transform,
+	                                  rank,
+	                                  radius,
+	                                  pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weightImage,
+                                                           float *variance,
+                                                           float *differenceImage,
+                                                           int *rect,
+                                                           int w,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+	kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+                                                float *outImage,
+                                                int *rect,
+                                                int w,
+                                                int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+	kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+                                                       float *outImage,
+                                                       int *rect,
+                                                       int w,
+                                                       int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+	kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *differenceImage,
+                                                         float *image,
+                                                         float *outImage,
+                                                         float *accumImage,
+                                                         int *rect,
+                                                         int w,
+                                                         int f)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+	kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *differenceImage,
+                                                             float *buffer,
+                                                             float *color_pass,
+                                                             float *variance_pass,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_rect,
+                                                             int w,
+                                                             int h,
+                                                             int f,
+                                                             int pass_stride)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+    kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+                                                     float *accumImage,
+                                                     int *rect,
+                                                     int w)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+	kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                int w,
+                                                int h,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample)
+{
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+	XtWX += storage_ofs*XTWX_SIZE;
+	XtWY += storage_ofs*XTWY_SIZE;
+	rank += storage_ofs;
+	kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
 /* Optimized CPU kernel entry points. This file is compiled with AVX
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#endif
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_avx
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#  define __KERNEL_AVX2__
-#endif
-
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 39c9a9cf33c..9895080d328 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
 DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
 
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 8c05dd1d9ef..b9d82781840 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -57,6 +57,11 @@
 #  include "kernel/split/kernel_buffer_update.h"
 #endif
 
+#ifdef KERNEL_STUB
+#  include "util/util_debug.h"
+#  define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
                                            int offset,
                                            int stride)
 {
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+#  ifdef __BRANCHED_PATH__
 	if(kernel_data.integrator.branched) {
 		kernel_branched_path_trace(kg,
 		                           buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
 		                           stride);
 	}
 	else
-#endif
+#  endif
 	{
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 	}
+#endif /* KERNEL_STUB */
 }
 
 /* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
                                                 int offset,
                                                 int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
 	kernel_film_convert_to_byte(kg,
 	                            rgba,
 	                            buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
+#endif /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
                                                       int offset,
                                                       int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
 	kernel_film_convert_to_half_float(kg,
 	                                  rgba,
 	                                  buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
+#endif /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        int offset,
                                        int sample)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, shader);
+#else
 	if(type >= SHADER_EVAL_BAKE) {
 		kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+#  ifdef __BAKING__
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		                     i,
 		                     offset,
 		                     sample);
-#endif
+#  endif
 	}
 	else {
 		kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		                       i,
 		                       sample);
 	}
+#endif /* KERNEL_STUB */
 }
 
 #else  /* __SPLIT_KERNEL__ */
 
 /* Split Kernel Path Tracing */
 
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
+#else
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
 	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
 	{ \
 		kernel_##name(kg); \
 	}
+#endif /* KERNEL_STUB */
 
 #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
 	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
-	REGISTER(path_trace);
-	REGISTER(convert_to_byte);
-	REGISTER(convert_to_half_float);
-	REGISTER(shader);
-
-	REGISTER(data_init);
-	REGISTER(path_init);
-	REGISTER(scene_intersect);
-	REGISTER(lamp_emission);
-	REGISTER(do_volume);
-	REGISTER(queue_enqueue);
-	REGISTER(indirect_background);
-	REGISTER(shader_setup);
-	REGISTER(shader_sort);
-	REGISTER(shader_eval);
-	REGISTER(holdout_emission_blurring_pathtermination_ao);
-	REGISTER(subsurface_scatter);
-	REGISTER(direct_lighting);
-	REGISTER(shadow_blocked_ao);
-	REGISTER(shadow_blocked_dl);
-	REGISTER(next_iteration_setup);
-	REGISTER(indirect_subsurface);
-	REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
 #endif  /* __SPLIT_KERNEL__ */
 
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
 /* Optimized CPU kernel entry points. This file is compiled with AVX
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#endif
 
 #define __SPLIT_KERNEL__
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_avx
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#  define __KERNEL_AVX2__
-#endif
-
 #define __SPLIT_KERNEL__
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#endif
-
 #define __SPLIT_KERNEL__
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#endif
-
 #define __SPLIT_KERNEL__
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#endif
-
 #define __SPLIT_KERNEL__
 
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#endif
-
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#endif
-
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#endif
-
 #include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel/kernel.h"
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..50f73f9728d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+                                 TilesInfo *tiles,
+                                 float *unfilteredA,
+                                 float *unfilteredB,
+                                 float *sampleVariance,
+                                 float *sampleVarianceV,
+                                 float *bufferVariance,
+                                 int4 prefilter_rect,
+                                 int buffer_pass_stride,
+                                 int buffer_denoising_offset,
+                                 bool use_split_variance)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_divide_shadow(sample,
+		                            tiles,
+		                            x, y,
+		                            unfilteredA,
+		                            unfilteredB,
+		                            sampleVariance,
+		                            sampleVarianceV,
+		                            bufferVariance,
+		                            prefilter_rect,
+		                            buffer_pass_stride,
+		                            buffer_denoising_offset,
+		                            use_split_variance);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+                               TilesInfo *tiles,
+                               int m_offset,
+                               int v_offset,
+                               float *mean,
+                               float *variance,
+                               int4 prefilter_rect,
+                               int buffer_pass_stride,
+                               int buffer_denoising_offset,
+                               bool use_split_variance)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_get_feature(sample,
+		                          tiles,
+		                          m_offset, v_offset,
+		                          x, y,
+		                          mean, variance,
+		                          prefilter_rect,
+		                          buffer_pass_stride,
+		                          buffer_denoising_offset,
+		                          use_split_variance);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+                                       float *transform, int *rank,
+                                       int4 filter_area, int4 rect,
+                                       int radius, float pca_threshold,
+                                       int pass_stride)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < filter_area.z && y < filter_area.w) {
+		int *l_rank = rank + y*filter_area.z + x;
+		float *l_transform = transform + y*filter_area.z + x;
+		kernel_filter_construct_transform(buffer,
+		                                  x + filter_area.x, y + filter_area.y,
+		                                  rect, pass_stride,
+		                                  l_transform, l_rank,
+		                                  radius, pca_threshold,
+		                                  filter_area.z*filter_area.w,
+		                                  threadIdx.y*blockDim.x + threadIdx.x);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+                                       float ccl_restrict_ptr weightImage,
+                                       float ccl_restrict_ptr varianceImage,
+                                       float *differenceImage,
+                                       int4 rect, int w,
+                                       int channel_offset,
+                                       float a, float k_2) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+                                     float ccl_restrict_ptr differenceImage,
+                                     float ccl_restrict_ptr image,
+                                     float *outImage, float *accumImage,
+                                     int4 rect, int w,
+                                     int f) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+                                         float ccl_restrict_ptr differenceImage,
+                                         float ccl_restrict_ptr buffer,
+                                         float *color_pass,
+                                         float *variance_pass,
+                                         float const* __restrict__ transform,
+                                         int *rank,
+                                         float *XtWX,
+                                         float3 *XtWY,
+                                         int4 rect,
+                                         int4 filter_rect,
+                                         int w, int h, int f,
+                                         int pass_stride) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+	int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+		kernel_filter_nlm_construct_gramian(x, y,
+		                                    dx, dy,
+		                                    differenceImage,
+		                                    buffer,
+		                                    color_pass, variance_pass,
+		                                    transform, rank,
+		                                    XtWX, XtWY,
+		                                    rect, filter_rect,
+		                                    w, h, f,
+		                                    pass_stride,
+		                                    threadIdx.y*blockDim.x + threadIdx.x);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+                            float *buffer, int *rank,
+                            float *XtWX, float3 *XtWY,
+                            int4 filter_area, int4 buffer_params,
+                            int sample) {
+	int x = blockDim.x*blockIdx.x + threadIdx.x;
+	int y = blockDim.y*blockIdx.y + threadIdx.y;
+	if(x < filter_area.z && y < filter_area.w) {
+		int storage_ofs = y*filter_area.z+x;
+		rank += storage_ofs;
+		XtWX += storage_ofs;
+		XtWY += storage_ofs;
+		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+	}
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..3d82bff9892
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+                                              ccl_global TilesInfo *tiles,
+                                              ccl_global float *unfilteredA,
+                                              ccl_global float *unfilteredB,
+                                              ccl_global float *sampleVariance,
+                                              ccl_global float *sampleVarianceV,
+                                              ccl_global float *bufferVariance,
+                                              int4 prefilter_rect,
+                                              int buffer_pass_stride,
+                                              int buffer_denoising_offset,
+                                              char use_split_variance)
+{
+	int x = prefilter_rect.x + get_global_id(0);
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_divide_shadow(sample,
+		                            tiles,
+		                            x, y,
+		                            unfilteredA,
+		                            unfilteredB,
+		                            sampleVariance,
+		                            sampleVarianceV,
+		                            bufferVariance,
+		                            prefilter_rect,
+		                            buffer_pass_stride,
+		                            buffer_denoising_offset,
+		                            use_split_variance);
+	}
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+                                            ccl_global TilesInfo *tiles,
+                                            int m_offset,
+                                            int v_offset,
+                                            ccl_global float *mean,
+                                            ccl_global float *variance,
+                                            int4 prefilter_rect,
+                                            int buffer_pass_stride,
+                                            int buffer_denoising_offset,
+                                            char use_split_variance)
+{
+	int x = prefilter_rect.x + get_global_id(0);
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_get_feature(sample,
+		                          tiles,
+		                          m_offset, v_offset,
+		                          x, y,
+		                          mean, variance,
+		                          prefilter_rect,
+		                          buffer_pass_stride,
+		                          buffer_denoising_offset,
+		                          use_split_variance);
+	}
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+                                               ccl_global float *variance,
+                                               ccl_global float *a,
+                                               ccl_global float *b,
+                                               int4 prefilter_rect,
+                                               int r)
+{
+	int x = prefilter_rect.x + get_global_id(0);
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+	}
+}
+
+__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+                                                    ccl_global float *transform,
+                                                    ccl_global int *rank,
+                                                    int4 filter_area,
+                                                    int4 rect,
+                                                    int pass_stride,
+                                                    int radius,
+                                                    float pca_threshold)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+	if(x < filter_area.z && y < filter_area.w) {
+		ccl_global int *l_rank = rank + y*filter_area.z + x;
+		ccl_global float *l_transform = transform + y*filter_area.z + x;
+		kernel_filter_construct_transform(buffer,
+		                                  x + filter_area.x, y + filter_area.y,
+		                                  rect, pass_stride,
+		                                  l_transform, l_rank,
+		                                  radius, pca_threshold,
+		                                  filter_area.z*filter_area.w,
+		                                  get_local_id(1)*get_local_size(0) + get_local_id(0));
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+                                                    int dy,
+                                                    ccl_global float ccl_restrict_ptr weightImage,
+                                                    ccl_global float ccl_restrict_ptr varianceImage,
+                                                    ccl_global float *differenceImage,
+                                                    int4 rect,
+                                                    int w,
+                                                    int channel_offset,
+                                                    float a,
+                                                    float k_2) {
+	int x = get_global_id(0) + rect.x;
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
+                                         ccl_global float *outImage,
+                                         int4 rect,
+                                         int w,
+                                         int f) {
+	int x = get_global_id(0) + rect.x;
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
+                                                ccl_global float *outImage,
+                                                int4 rect,
+                                                int w,
+                                                int f) {
+	int x = get_global_id(0) + rect.x;
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+                                                  int dy,
+                                                  ccl_global float ccl_restrict_ptr differenceImage,
+                                                  ccl_global float ccl_restrict_ptr image,
+                                                  ccl_global float *outImage,
+                                                  ccl_global float *accumImage,
+                                                  int4 rect,
+                                                  int w,
+                                                  int f) {
+	int x = get_global_id(0) + rect.x;
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
+                                              ccl_global float ccl_restrict_ptr accumImage,
+                                              int4 rect,
+                                              int w) {
+	int x = get_global_id(0) + rect.x;
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {
+		kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+                                                      int dy,
+                                                      ccl_global float ccl_restrict_ptr differenceImage,
+                                                      ccl_global float ccl_restrict_ptr buffer,
+                                                      ccl_global float *color_pass,
+                                                      ccl_global float *variance_pass,
+                                                      ccl_global float ccl_restrict_ptr transform,
+                                                      ccl_global int *rank,
+                                                      ccl_global float *XtWX,
+                                                      ccl_global float3 *XtWY,
+                                                      int4 rect,
+                                                      int4 filter_rect,
+                                                      int w,
+                                                      int h,
+                                                      int f,
+                                                      int pass_stride) {
+	int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+	int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+		kernel_filter_nlm_construct_gramian(x, y,
+		                                    dx, dy,
+		                                    differenceImage,
+		                                    buffer,
+		                                    color_pass, variance_pass,
+		                                    transform, rank,
+		                                    XtWX, XtWY,
+		                                    rect, filter_rect,
+		                                    w, h, f,
+		                                    pass_stride,
+		                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
+	}
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+	                                     int h,
+                                         ccl_global float *buffer,
+                                         ccl_global int *rank,
+                                         ccl_global float *XtWX,
+                                         ccl_global float3 *XtWY,
+                                         int4 filter_area,
+                                         int4 buffer_params,
+                                         int sample) {
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+	if(x < filter_area.z && y < filter_area.w) {
+		int storage_ofs = y*filter_area.z+x;
+		rank += storage_ofs;
+		XtWX += storage_ofs;
+		XtWY += storage_ofs;
+		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+	}
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+                                          ccl_global float *buffer_1,
+                                          ccl_global float *buffer_2,
+                                          ccl_global float *buffer_3,
+                                          ccl_global float *buffer_4,
+                                          ccl_global float *buffer_5,
+                                          ccl_global float *buffer_6,
+                                          ccl_global float *buffer_7,
+                                          ccl_global float *buffer_8,
+                                          ccl_global float *buffer_9)
+{
+	if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+		tiles->buffers[0] = buffer_1;
+		tiles->buffers[1] = buffer_2;
+		tiles->buffers[2] = buffer_3;
+		tiles->buffers[3] = buffer_4;
+		tiles->buffers[4] = buffer_5;
+		tiles->buffers[5] = buffer_6;
+		tiles->buffers[6] = buffer_7;
+		tiles->buffers[7] = buffer_8;
+		tiles->buffers[8] = buffer_9;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index c7bc1b4df0a..dc74a2ada53 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	float3 throughput = branched_state->throughput;
+	ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(ps->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
 
 	for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
 		const ShaderClosure *sc = &sd->closure[i];
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 		RNG bsdf_rng = cmj_hash(rng, i);
 
 		for(int j = branched_state->next_sample; j < num_samples; j++) {
-			ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
 			if(reset_path_state) {
 				*ps = branched_state->path_state;
 			}
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 			                                        tp,
 			                                        ps,
 			                                        L,
-			                                        bsdf_ray))
+			                                        bsdf_ray,
+			                                        sum_sample_weight))
 			{
 				continue;
 			}
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 859c221d976..1f6dce0253c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		float3 L_sum;
-#ifdef __SHADOW_TRICKS__
-		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
-			L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
-		}
-		else
-#endif  /* __SHADOW_TRICKS__ */
-		{
-			L_sum = path_radiance_clamp_and_sum(kg, L);
-		}
 		kernel_write_light_passes(kg, buffer, L, sample);
 #ifdef __KERNEL_DEBUG__
 		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
 #endif
-		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
 
 		/* accumulate result in output buffer */
-		kernel_write_pass_float4(buffer, sample, L_rad);
+		bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
+		kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
+
 		path_rng_end(kg, rng_state, rng);
 
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 87498910d38..670a557f084 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 #ifdef __SHADOW_TRICKS__
 		if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
 			if(state->flag & PATH_RAY_CAMERA) {
-				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
 				state->catcher_object = sd->object;
 				if(!kernel_data.background.transparent) {
 					PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 					kernel_split_state.throughput[ray_index] = throughput/probability;
 				}
 			}
+
+			kernel_update_denoising_features(kg, sd, state, L);
 		}
 	}
 
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 452b6e45a36..386fbbc4d09 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 			               &shadow))
 		{
 			/* accumulate */
-			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 		}
 		else {
-			path_radiance_accum_total_light(L, throughput, &L_light);
+			path_radiance_accum_total_light(L, state, throughput, &L_light);
 		}
 	}
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 407f8e784c0..7918c640175 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 			if(bsdf) {
+				bsdf->N = N;
 				sd->flag |= bsdf_transparent_setup(bsdf);
 			}
 			break;
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 				if(bsdf) {
+					bsdf->N = N;
 					/* todo: giving a fixed weight here will cause issues when
 					 * mixing multiple BSDFS. energy will not be conserved and
 					 * the throughput can blow up after multiple bounces. we
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index c94fa130af7..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	strength = max(strength, 0.0f);
 
 	/* compute and output perturbed normal */
-	float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
-	normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+	if(is_zero(normal_out)) {
+		normal_out = normal_in;
+	}
+	else {
+		normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	}
 
 	if(use_object_space) {
 		object_normal_transform(kg, sd, &normal_out);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 4a09d9f6653..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
 #ifdef __UV__
 		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
 #endif
+		default: data = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	stack_store_float3(stack, out_offset, data);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 328ff79223b..8e45dbfa5ff 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
 	float3 co = stack_load_float3(stack, co_offset);
 	float2 uv;
 
-	co = normalize(co);
-	
+	co = safe_normalize(co);
+
 	if(projection == 0)
 		uv = direction_to_equirectangular(co);
 	else
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index cc9b840b13f..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -402,7 +402,6 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
 	/* Glossy */
-	CLOSURE_BSDF_GLOSSY_ID,
 	CLOSURE_BSDF_REFLECTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@@ -423,14 +422,13 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
 
 	/* Transmission */
-	CLOSURE_BSDF_TRANSMISSION_ID,
 	CLOSURE_BSDF_TRANSLUCENT_ID,
 	CLOSURE_BSDF_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
-	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
 	CLOSURE_BSDF_SHARP_GLASS_ID,
 	CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@@ -465,13 +463,16 @@ typedef enum ClosureType {
 /* watch this, being lazy with memory usage */
 #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
 #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
                                             type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
                                             type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+                                          (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
 #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -480,7 +481,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
 #define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fe2c2e78926..cf402c3f214 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -42,6 +42,9 @@ BufferParams::BufferParams()
 	full_width = 0;
 	full_height = 0;
 
+	denoising_data_pass = false;
+	denoising_clean_pass = false;
+
 	Pass::add(PASS_COMBINED, passes);
 }
 
@@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
 
 	for(size_t i = 0; i < passes.size(); i++)
 		size += passes[i].components;
-	
+
+	if(denoising_data_pass) {
+		size += DENOISING_PASS_SIZE_BASE;
+		if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+	}
+
 	return align_up(size, 4);
 }
 
+int BufferParams::get_denoising_offset()
+{
+	int offset = 0;
+
+	for(size_t i = 0; i < passes.size(); i++)
+		offset += passes[i].components;
+
+	return offset;
+}
+
 /* Render Buffer Task */
 
 RenderTile::RenderTile()
@@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
 }
 
-bool RenderBuffers::copy_from_device()
+bool RenderBuffers::copy_from_device(Device *from_device)
 {
 	if(!buffer.device_pointer)
 		return false;
 
-	device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+	if(!from_device) {
+		from_device = device;
+	}
+
+	from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
+
+	return true;
+}
+
+bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
+{
+	float scale = 1.0f/sample;
+
+	if(offset == DENOISING_PASS_COLOR) {
+		scale *= exposure;
+	}
+	else if(offset == DENOISING_PASS_COLOR_VAR) {
+		scale *= exposure*exposure;
+	}
+
+	offset += params.get_denoising_offset();
+	float *in = (float*)buffer.data_pointer + offset;
+	int pass_stride = params.get_passes_size();
+	int size = params.width*params.height;
+
+	if(components == 1) {
+		for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+			pixels[0] = in[0]*scale;
+		}
+	}
+	else if(components == 3) {
+		for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+			pixels[0] = in[0]*scale;
+			pixels[1] = in[1]*scale;
+			pixels[2] = in[2]*scale;
+		}
+	}
+	else {
+		return false;
+	}
 
 	return true;
 }
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 5c78971678a..e56556c8abe 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -51,6 +51,9 @@ public:
 
 	/* passes */
 	array<Pass> passes;
+	bool denoising_data_pass;
+	/* If only some light path types should be denoised, an additional pass is needed. */
+	bool denoising_clean_pass;
 
 	/* functions */
 	BufferParams();
@@ -59,6 +62,7 @@ public:
 	bool modified(const BufferParams& params);
 	void add_pass(PassType type);
 	int get_passes_size();
+	int get_denoising_offset();
 };
 
 /* Render Buffers */
@@ -73,18 +77,19 @@ public:
 	/* random number generator state */
 	device_vector<uint> rng_state;
 
+	Device *device;
+
 	explicit RenderBuffers(Device *device);
 	~RenderBuffers();
 
 	void reset(Device *device, BufferParams& params);
 
-	bool copy_from_device();
+	bool copy_from_device(Device *from_device = NULL);
 	bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
+	bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
 
 protected:
 	void device_free();
-
-	Device *device;
 };
 
 /* Display Buffer
@@ -131,6 +136,9 @@ protected:
 
 class RenderTile {
 public:
+	typedef enum { PATH_TRACE, DENOISE } Task;
+
+	Task task;
 	int x, y, w, h;
 	int start_sample;
 	int num_samples;
@@ -138,6 +146,7 @@ public:
 	int resolution;
 	int offset;
 	int stride;
+	int tile_index;
 
 	device_ptr buffer;
 	device_ptr rng_state;
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 7809f4345f1..c8213d258d5 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -279,6 +279,10 @@ NODE_DEFINE(Film)
 
 	SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
 
+	SOCKET_BOOLEAN(denoising_data_pass,  "Generate Denoising Data Pass",  false);
+	SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+	SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+
 	return type;
 }
 
@@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 		kfilm->pass_stride += pass.components;
 	}
 
+	kfilm->pass_denoising_data = 0;
+	kfilm->pass_denoising_clean = 0;
+	kfilm->denoising_flags = 0;
+	if(denoising_data_pass) {
+		kfilm->pass_denoising_data = kfilm->pass_stride;
+		kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
+		kfilm->denoising_flags = denoising_flags;
+		if(denoising_clean_pass) {
+			kfilm->pass_denoising_clean = kfilm->pass_stride;
+			kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
+			kfilm->use_light_pass = 1;
+		}
+	}
+
 	kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
 	kfilm->pass_alpha_threshold = pass_alpha_threshold;
 
@@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
 	kfilm->mist_falloff = mist_falloff;
 
+	pass_stride = kfilm->pass_stride;
+	denoising_data_offset = kfilm->pass_denoising_data;
+	denoising_clean_offset = kfilm->pass_denoising_clean;
+
 	need_update = false;
 }
 
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 83c941d5c57..29b1e7e9157 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -57,8 +57,15 @@ public:
 
 	float exposure;
 	array<Pass> passes;
+	bool denoising_data_pass;
+	bool denoising_clean_pass;
+	int denoising_flags;
 	float pass_alpha_threshold;
 
+	int pass_stride;
+	int denoising_data_offset;
+	int denoising_clean_offset;
+
 	FilterType filter_type;
 	float filter_width;
 	size_t filter_table_offset;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 33d1936659b..03825f780e0 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
 		float3 vNi = vN[i];
 
 		if(do_transform)
-			vNi = normalize(transform_direction(&ntfm, vNi));
+			vNi = safe_normalize(transform_direction(&ntfm, vNi));
 
 		vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
 	}
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c9b5547b407..3eaf34c847f 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -114,8 +114,9 @@ Session::~Session()
 	}
 
 	/* clean up */
-	foreach(RenderBuffers *buffers, tile_buffers)
-		delete buffers;
+	foreach(RenderTile &rtile, render_tiles)
+		delete rtile.buffers;
+	tile_manager.free_device();
 
 	delete buffers;
 	delete display;
@@ -268,8 +269,8 @@ void Session::run_gpu()
 			/* update status and timing */
 			update_status_time();
 
-			/* path trace */
-			path_trace();
+			/* render */
+			render();
 
 			device->task_wait();
 
@@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 	thread_scoped_lock tile_lock(tile_mutex);
 
 	/* get next tile from manager */
-	Tile tile;
+	Tile *tile;
 	int device_num = device->device_number(tile_device);
 
 	if(!tile_manager.next_tile(tile, device_num))
 		return false;
 	
 	/* fill render tile */
-	rtile.x = tile_manager.state.buffer.full_x + tile.x;
-	rtile.y = tile_manager.state.buffer.full_y + tile.y;
-	rtile.w = tile.w;
-	rtile.h = tile.h;
+	rtile.x = tile_manager.state.buffer.full_x + tile->x;
+	rtile.y = tile_manager.state.buffer.full_y + tile->y;
+	rtile.w = tile->w;
+	rtile.h = tile->h;
 	rtile.start_sample = tile_manager.state.sample;
 	rtile.num_samples = tile_manager.state.num_samples;
 	rtile.resolution = tile_manager.state.resolution_divider;
+	rtile.tile_index = tile->index;
+	rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE;
 
 	tile_lock.unlock();
 
@@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 		rtile.buffer = buffers->buffer.device_pointer;
 		rtile.rng_state = buffers->rng_state.device_pointer;
 		rtile.buffers = buffers;
+		tile->buffers = buffers;
 
 		device->map_tile(tile_device, rtile);
 
 		return true;
 	}
 
-	/* fill buffer parameters */
-	BufferParams buffer_params = tile_manager.params;
-	buffer_params.full_x = rtile.x;
-	buffer_params.full_y = rtile.y;
-	buffer_params.width = rtile.w;
-	buffer_params.height = rtile.h;
-
-	buffer_params.get_offset_stride(rtile.offset, rtile.stride);
-
-	RenderBuffers *tilebuffers;
+	bool store_rtile = false;
+	if(tile->buffers == NULL) {
+		/* fill buffer parameters */
+		BufferParams buffer_params = tile_manager.params;
+		buffer_params.full_x = rtile.x;
+		buffer_params.full_y = rtile.y;
+		buffer_params.width = rtile.w;
+		buffer_params.height = rtile.h;
+
+		/* allocate buffers */
+		if(params.progressive_refine) {
+			tile_lock.lock();
+
+			if(render_tiles.size() == 0) {
+				RenderTile nulltile;
+				nulltile.buffers = NULL;
+				render_tiles.resize(tile_manager.state.num_tiles, nulltile);
+			}
 
-	/* allocate buffers */
-	if(params.progressive_refine) {
-		tile_lock.lock();
+			/* In certain circumstances number of tiles in the tile manager could
+			 * be changed. This is not supported by the progressive refine feature.
+			 */
+			assert(render_tiles.size() == tile_manager.state.num_tiles);
 
-		if(tile_buffers.size() == 0)
-			tile_buffers.resize(tile_manager.state.num_tiles, NULL);
+			RenderTile &stored_rtile = render_tiles[tile->index];
+			if(stored_rtile.buffers == NULL) {
+				tile->buffers = new RenderBuffers(tile_device);
+				tile->buffers->reset(tile_device, buffer_params);
+				store_rtile = true;
+			}
+			else {
+				assert(rtile.x == stored_rtile.x &&
+				       rtile.y == stored_rtile.y &&
+				       rtile.w == stored_rtile.w &&
+				       rtile.h == stored_rtile.h);
+				tile_lock.unlock();
+				tile->buffers = stored_rtile.buffers;
+			}
+		}
+		else {
+			tile->buffers = new RenderBuffers(tile_device);
 
-		/* In certain circumstances number of tiles in the tile manager could
-		 * be changed. This is not supported by the progressive refine feature.
-		 */
-		assert(tile_buffers.size() == tile_manager.state.num_tiles);
+			tile->buffers->reset(tile_device, buffer_params);
+		}
+	}
 
-		tilebuffers = tile_buffers[tile.index];
-		if(tilebuffers == NULL) {
-			tilebuffers = new RenderBuffers(tile_device);
-			tile_buffers[tile.index] = tilebuffers;
+	tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
 
-			tilebuffers->reset(tile_device, buffer_params);
-		}
+	rtile.buffer = tile->buffers->buffer.device_pointer;
+	rtile.rng_state = tile->buffers->rng_state.device_pointer;
+	rtile.buffers = tile->buffers;
+	rtile.sample = 0;
 
+	if(store_rtile) {
+		render_tiles[tile->index] = rtile;
 		tile_lock.unlock();
 	}
-	else {
-		tilebuffers = new RenderBuffers(tile_device);
-
-		tilebuffers->reset(tile_device, buffer_params);
-	}
-
-	rtile.buffer = tilebuffers->buffer.device_pointer;
-	rtile.rng_state = tilebuffers->rng_state.device_pointer;
-	rtile.buffers = tilebuffers;
 
 	/* this will tag tile as IN PROGRESS in blender-side render pipeline,
 	 * which is needed to highlight currently rendering tile before first
@@ -449,7 +468,7 @@ void Session::update_tile_sample(RenderTile& rtile)
 		if(params.progressive_refine == false) {
 			/* todo: optimize this by making it thread safe and removing lock */
 
-			update_render_tile_cb(rtile);
+			update_render_tile_cb(rtile, true);
 		}
 	}
 
@@ -462,18 +481,75 @@ void Session::release_tile(RenderTile& rtile)
 
 	progress.add_finished_tile();
 
-	if(write_render_tile_cb) {
-		if(params.progressive_refine == false) {
-			/* todo: optimize this by making it thread safe and removing lock */
-			write_render_tile_cb(rtile);
+	bool delete_tile;
 
-			delete rtile.buffers;
+	if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) {
+		if(write_render_tile_cb && params.progressive_refine == false) {
+			write_render_tile_cb(rtile);
+			if(delete_tile) {
+				delete rtile.buffers;
+				tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
+			}
+		}
+	}
+	else {
+		if(update_render_tile_cb && params.progressive_refine == false) {
+			update_render_tile_cb(rtile, false);
 		}
 	}
 
 	update_status_time();
 }
 
+void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+	thread_scoped_lock tile_lock(tile_mutex);
+
+	int center_idx = tiles[4].tile_index;
+	assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+	BufferParams buffer_params = tile_manager.params;
+	int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y,
+	                              buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height);
+
+	for(int dy = -1, i = 0; dy <= 1; dy++) {
+		for(int dx = -1; dx <= 1; dx++, i++) {
+			int px = tiles[4].x + dx*params.tile_size.x;
+			int py = tiles[4].y + dy*params.tile_size.y;
+			if(px >= image_region.x && py >= image_region.y &&
+			   px <  image_region.z && py <  image_region.w) {
+				int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx;
+				Tile *tile = &tile_manager.state.tiles[tile_index];
+				assert(tile->buffers);
+
+				tiles[i].buffer = tile->buffers->buffer.device_pointer;
+				tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
+				tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
+				tiles[i].w = tile->w;
+				tiles[i].h = tile->h;
+				tiles[i].buffers = tile->buffers;
+
+				tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
+			}
+			else {
+				tiles[i].buffer = (device_ptr)NULL;
+				tiles[i].buffers = NULL;
+				tiles[i].x = clamp(px, image_region.x, image_region.z);
+				tiles[i].y = clamp(py, image_region.y, image_region.w);
+				tiles[i].w = tiles[i].h = 0;
+			}
+		}
+	}
+
+	assert(tiles[4].buffers);
+	device->map_neighbor_tiles(tile_device, tiles);
+}
+
+void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+{
+	thread_scoped_lock tile_lock(tile_mutex);
+	device->unmap_neighbor_tiles(tile_device, tiles);
+}
+
 void Session::run_cpu()
 {
 	bool tiles_written = false;
@@ -558,8 +634,8 @@ void Session::run_cpu()
 			/* update status and timing */
 			update_status_time();
 
-			/* path trace */
-			path_trace();
+			/* render */
+			render();
 
 			/* update status and timing */
 			update_status_time();
@@ -744,10 +820,10 @@ void Session::reset(BufferParams& buffer_params, int samples)
 	if(params.progressive_refine) {
 		thread_scoped_lock buffers_lock(buffers_mutex);
 
-		foreach(RenderBuffers *buffers, tile_buffers)
-			delete buffers;
+		foreach(RenderTile &rtile, render_tiles)
+			delete rtile.buffers;
 
-		tile_buffers.clear();
+		render_tiles.clear();
 	}
 }
 
@@ -882,13 +958,15 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	progress.set_status(status, substatus);
 }
 
-void Session::path_trace()
+void Session::render()
 {
 	/* add path trace task */
-	DeviceTask task(DeviceTask::PATH_TRACE);
+	DeviceTask task(DeviceTask::RENDER);
 	
 	task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
 	task.release_tile = function_bind(&Session::release_tile, this, _1);
+	task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
+	task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
 	task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
 	task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
 	task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
@@ -897,6 +975,18 @@ void Session::path_trace()
 	task.requested_tile_size = params.tile_size;
 	task.passes_size = tile_manager.params.get_passes_size();
 
+	if(params.use_denoising) {
+		task.denoising_radius = params.denoising_radius;
+		task.denoising_strength = params.denoising_strength;
+		task.denoising_feature_strength = params.denoising_feature_strength;
+		task.denoising_relative_pca = params.denoising_relative_pca;
+
+		assert(!scene->film->need_update);
+		task.pass_stride = scene->film->pass_stride;
+		task.pass_denoising_data = scene->film->denoising_data_offset;
+		task.pass_denoising_clean = scene->film->denoising_clean_offset;
+	}
+
 	device->task_add(task);
 }
 
@@ -940,9 +1030,7 @@ bool Session::update_progressive_refine(bool cancel)
 	}
 
 	if(params.progressive_refine) {
-		foreach(RenderBuffers *buffers, tile_buffers) {
-			RenderTile rtile;
-			rtile.buffers = buffers;
+		foreach(RenderTile &rtile, render_tiles) {
 			rtile.sample = sample;
 
 			if(write) {
@@ -951,7 +1039,7 @@ bool Session::update_progressive_refine(bool cancel)
 			}
 			else {
 				if(update_render_tile_cb)
-					update_render_tile_cb(rtile);
+					update_render_tile_cb(rtile, true);
 			}
 		}
 	}
@@ -965,10 +1053,11 @@ void Session::device_free()
 {
 	scene->device_free();
 
-	foreach(RenderBuffers *buffers, tile_buffers)
-		delete buffers;
+	foreach(RenderTile &tile, render_tiles)
+		delete tile.buffers;
+	tile_manager.free_device();
 
-	tile_buffers.clear();
+	render_tiles.clear();
 
 	/* used from background render only, so no need to
 	 * re-create render/display buffers here
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index a7e5f78a64d..a7ca90abbce 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -57,6 +57,12 @@ public:
 
 	bool display_buffer_linear;
 
+	bool use_denoising;
+	int denoising_radius;
+	float denoising_strength;
+	float denoising_feature_strength;
+	bool denoising_relative_pca;
+
 	double cancel_timeout;
 	double reset_timeout;
 	double text_timeout;
@@ -77,6 +83,12 @@ public:
 		start_resolution = INT_MAX;
 		threads = 0;
 
+		use_denoising = false;
+		denoising_radius = 8;
+		denoising_strength = 0.0f;
+		denoising_feature_strength = 0.0f;
+		denoising_relative_pca = false;
+
 		display_buffer_linear = false;
 
 		cancel_timeout = 0.1;
@@ -99,6 +111,11 @@ public:
 		&& tile_size == params.tile_size
 		&& start_resolution == params.start_resolution
 		&& threads == params.threads
+		&& use_denoising == params.use_denoising
+		&& denoising_radius == params.denoising_radius
+		&& denoising_strength == params.denoising_strength
+		&& denoising_feature_strength == params.denoising_feature_strength
+		&& denoising_relative_pca == params.denoising_relative_pca
 		&& display_buffer_linear == params.display_buffer_linear
 		&& cancel_timeout == params.cancel_timeout
 		&& reset_timeout == params.reset_timeout
@@ -126,7 +143,7 @@ public:
 	Stats stats;
 
 	function<void(RenderTile&)> write_render_tile_cb;
-	function<void(RenderTile&)> update_render_tile_cb;
+	function<void(RenderTile&, bool)> update_render_tile_cb;
 
 	explicit Session(const SessionParams& params);
 	~Session();
@@ -162,7 +179,7 @@ protected:
 	void update_status_time(bool show_pause = false, bool show_done = false);
 
 	void tonemap(int sample);
-	void path_trace();
+	void render();
 	void reset_(BufferParams& params, int samples);
 
 	void run_cpu();
@@ -177,6 +194,9 @@ protected:
 	void update_tile_sample(RenderTile& tile);
 	void release_tile(RenderTile& tile);
 
+	void map_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+	void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+
 	bool device_use_gl;
 
 	thread *session_thread;
@@ -202,7 +222,7 @@ protected:
 	double last_update_time;
 	bool update_progressive_refine(bool cancel);
 
-	vector<RenderBuffers *> tile_buffers;
+	vector<RenderTile> render_tiles;
 
 	DeviceRequestedFeatures get_requested_device_features();
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 944e746ca2d..176a1f4f0f3 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -25,37 +25,39 @@ namespace {
 
 class TileComparator {
 public:
-	TileComparator(TileOrder order, int2 center)
-	 :  order_(order),
-	    center_(center)
+	TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
+	 :  order(order_),
+	    center(center_),
+	    tiles(tiles_)
 	{}
 
-	bool operator()(Tile &a, Tile &b)
+	bool operator()(int a, int b)
 	{
-		switch(order_) {
+		switch(order) {
 			case TILE_CENTER:
 			{
-				float2 dist_a = make_float2(center_.x - (a.x + a.w/2),
-				                            center_.y - (a.y + a.h/2));
-				float2 dist_b = make_float2(center_.x - (b.x + b.w/2),
-				                            center_.y - (b.y + b.h/2));
+				float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2),
+				                            center.y - (tiles[a].y + tiles[a].h/2));
+				float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2),
+				                            center.y - (tiles[b].y + tiles[b].h/2));
 				return dot(dist_a, dist_a) < dot(dist_b, dist_b);
 			}
 			case TILE_LEFT_TO_RIGHT:
-				return (a.x == b.x)? (a.y < b.y): (a.x < b.x);
+				return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x);
 			case TILE_RIGHT_TO_LEFT:
-				return (a.x == b.x)? (a.y < b.y): (a.x > b.x);
+				return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x);
 			case TILE_TOP_TO_BOTTOM:
-				return (a.y == b.y)? (a.x < b.x): (a.y > b.y);
+				return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y);
 			case TILE_BOTTOM_TO_TOP:
 			default:
-				return (a.y == b.y)? (a.x < b.x): (a.y < b.y);
+				return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y);
 		}
 	}
 
 protected:
-	TileOrder order_;
-	int2 center_;
+	TileOrder order;
+	int2 center;
+	Tile *tiles;
 };
 
 inline int2 hilbert_index_to_pos(int n, int d)
@@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i
 	num_devices = num_devices_;
 	preserve_tile_device = preserve_tile_device_;
 	background = background_;
+	schedule_denoising = false;
 
 	range_start_sample = 0;
 	range_num_samples = -1;
@@ -108,6 +111,16 @@ TileManager::~TileManager()
 {
 }
 
+void TileManager::free_device()
+{
+	if(schedule_denoising) {
+		for(int i = 0; i < state.tiles.size(); i++) {
+			delete state.tiles[i].buffers;
+			state.tiles[i].buffers = NULL;
+		}
+	}
+}
+
 static int get_divider(int w, int h, int start_resolution)
 {
 	int divider = 1;
@@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 	state.num_tiles = 0;
 	state.num_samples = 0;
 	state.resolution_divider = get_divider(params.width, params.height, start_resolution);
+	state.render_tiles.clear();
+	state.denoising_tiles.clear();
 	state.tiles.clear();
 }
 
@@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_)
 		}
 
 		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+		if(schedule_denoising) {
+			state.total_pixel_samples += params.width*params.height;
+		}
 	}
 }
 
@@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced)
 	int image_h = max(1, params.height/resolution);
 	int2 center = make_int2(image_w/2, image_h/2);
 
-	state.tiles.clear();
-
 	int num_logical_devices = preserve_tile_device? num_devices: 1;
 	int num = min(image_h, num_logical_devices);
 	int slice_num = sliced? num: 1;
-	int tile_index = 0;
+	int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
 
 	state.tiles.clear();
-	state.tiles.resize(num);
-	vector<list<Tile> >::iterator tile_list = state.tiles.begin();
+	state.render_tiles.clear();
+	state.denoising_tiles.clear();
+	state.render_tiles.resize(num);
+	state.denoising_tiles.resize(num);
+	state.tile_stride = tile_w;
+	vector<list<int> >::iterator tile_list;
+	tile_list = state.render_tiles.begin();
 
 	if(tile_order == TILE_HILBERT_SPIRAL) {
 		assert(!sliced);
 
+		int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+		state.tiles.resize(tile_w*tile_h);
+
 		/* Size of blocks in tiles, must be a power of 2 */
 		const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4;
 
-		int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
-		int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y;
-		int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+		int tiles_per_device = divide_up(tile_w * tile_h, num);
 		int cur_device = 0, cur_tiles = 0;
 
 		int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
 		/* Number of blocks to fill the image */
-		int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x;
-		int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y;
+		int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x);
+		int blocks_y = (block_size.y >= image_h)? 1: divide_up(image_h, block_size.y);
 		int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
 		/* Offset of spiral (to keep it centered) */
 		int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2);
@@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced)
 				if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
 					int w = min(tile_size.x, image_w - pos.x);
 					int h = min(tile_size.y, image_h - pos.y);
-					tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device));
+					int2 ipos = pos / tile_size;
+					int idx = ipos.y*tile_w + ipos.x;
+					state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
+					tile_list->push_front(idx);
 					cur_tiles++;
-					tile_index++;
 
 					if(cur_tiles == tiles_per_device) {
 						tile_list++;
@@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced)
 					break;
 			}
 		}
-		return tile_index;
+		return tile_w*tile_h;
 	}
 
+	int idx = 0;
 	for(int slice = 0; slice < slice_num; slice++) {
 		int slice_y = (image_h/slice_num)*slice;
 		int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num;
 
-		int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
-		int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y;
+		int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y);
 
-		int tiles_per_device = (tile_w * tile_h + num - 1) / num;
+		int tiles_per_device = divide_up(tile_w * tile_h, num);
 		int cur_device = 0, cur_tiles = 0;
 
 		for(int tile_y = 0; tile_y < tile_h; tile_y++) {
-			for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) {
+			for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
 				int x = tile_x * tile_size.x;
 				int y = tile_y * tile_size.y;
 				int w = (tile_x == tile_w-1)? image_w - x: tile_size.x;
 				int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y;
 
-				tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device));
+				state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER));
+				tile_list->push_back(idx);
 
 				if(!sliced) {
 					cur_tiles++;
@@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced)
 					if(cur_tiles == tiles_per_device) {
 						/* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
 						if(tile_order != TILE_BOTTOM_TO_TOP) {
-							tile_list->sort(TileComparator(tile_order, center));
+							tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
 						}
 						tile_list++;
 						cur_tiles = 0;
@@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced)
 		}
 	}
 
-	return tile_index;
+	return idx;
 }
 
 void TileManager::set_tiles()
@@ -333,15 +358,111 @@ void TileManager::set_tiles()
 	state.buffer.full_height = max(1, params.full_height/resolution);
 }
 
-bool TileManager::next_tile(Tile& tile, int device)
+int TileManager::get_neighbor_index(int index, int neighbor)
+{
+	static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+
+	int resolution = state.resolution_divider;
+	int image_w = max(1, params.width/resolution);
+	int image_h = max(1, params.height/resolution);
+	int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x);
+	int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y);
+
+	int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor];
+	if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+		return -1;
+
+	return ny*state.tile_stride + nx;
+}
+
+/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */
+bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+{
+	if(index < 0 || state.tiles[index].state < min_state) {
+		return false;
+	}
+	for(int neighbor = 0; neighbor < 9; neighbor++) {
+		int nindex = get_neighbor_index(index, neighbor);
+		/* Out-of-bounds tiles don't matter. */
+		if(nindex >= 0 && state.tiles[nindex].state < min_state) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */
+bool TileManager::finish_tile(int index, bool &delete_tile)
+{
+	delete_tile = false;
+
+	switch(state.tiles[index].state) {
+		case Tile::RENDER:
+		{
+			if(!schedule_denoising) {
+				state.tiles[index].state = Tile::DONE;
+				delete_tile = true;
+				return true;
+			}
+			state.tiles[index].state = Tile::RENDERED;
+			/* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */
+			for(int neighbor = 0; neighbor < 9; neighbor++) {
+				int nindex = get_neighbor_index(index, neighbor);
+				if(check_neighbor_state(nindex, Tile::RENDERED)) {
+					state.tiles[nindex].state = Tile::DENOISE;
+					state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
+				}
+			}
+			return false;
+		}
+		case Tile::DENOISE:
+		{
+			state.tiles[index].state = Tile::DENOISED;
+			/* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */
+			for(int neighbor = 0; neighbor < 9; neighbor++) {
+				int nindex = get_neighbor_index(index, neighbor);
+				if(check_neighbor_state(nindex, Tile::DENOISED)) {
+					state.tiles[nindex].state = Tile::DONE;
+					/* It can happen that the tile just finished denoising and already can be freed here.
+					 * However, in that case it still has to be written before deleting, so we can't delete it yet. */
+					if(neighbor == 8) {
+						delete_tile = true;
+					}
+					else {
+						delete state.tiles[nindex].buffers;
+						state.tiles[nindex].buffers = NULL;
+					}
+				}
+			}
+			return true;
+		}
+		default:
+			assert(false);
+			return true;
+	}
+}
+
+bool TileManager::next_tile(Tile* &tile, int device)
 {
 	int logical_device = preserve_tile_device? device: 0;
 
-	if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty())
+	if(logical_device >= state.render_tiles.size())
+		return false;
+
+	if(!state.denoising_tiles[logical_device].empty()) {
+		int idx = state.denoising_tiles[logical_device].front();
+		state.denoising_tiles[logical_device].pop_front();
+		tile = &state.tiles[idx];
+		return true;
+	}
+
+	if(state.render_tiles[logical_device].empty())
 		return false;
 
-	tile = Tile(state.tiles[logical_device].front());
-	state.tiles[logical_device].pop_front();
+	int idx = state.render_tiles[logical_device].front();
+	state.render_tiles[logical_device].pop_front();
+	tile = &state.tiles[idx];
 	return true;
 }
 
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 622b89f7670..e39a8f0627a 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -31,12 +31,20 @@ public:
 	int index;
 	int x, y, w, h;
 	int device;
+	/* RENDER: The tile has to be rendered.
+	 * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
+	 * DENOISE: The tile can be denoised now.
+	 * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
+	 * DONE: The tile is finished and has been freed. */
+	typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
+	State state;
+	RenderBuffers *buffers;
 
 	Tile()
 	{}
 
-	Tile(int index_, int x_, int y_, int w_, int h_, int device_)
-	: index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {}
+	Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
+	: index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {}
 };
 
 /* Tile order */
@@ -58,6 +66,8 @@ public:
 	BufferParams params;
 
 	struct State {
+		vector<Tile> tiles;
+		int tile_stride;
 		BufferParams buffer;
 		int sample;
 		int num_samples;
@@ -67,9 +77,12 @@ public:
 		/* Total samples over all pixels: Generally num_samples*num_pixels,
 		 * but can be higher due to the initial resolution division for previews. */
 		uint64_t total_pixel_samples;
-		/* This vector contains a list of tiles for every logical device in the session.
-		 * In each list, the tiles are sorted according to the tile order setting. */
-		vector<list<Tile> > tiles;
+
+		/* These lists contain the indices of the tiles to be rendered/denoised and are used
+		 * when acquiring a new tile for the device.
+		 * Each list in each vector is for one logical device. */
+		vector<list<int> > render_tiles;
+		vector<list<int> > denoising_tiles;
 	} state;
 
 	int num_samples;
@@ -78,10 +91,12 @@ public:
 	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
 	~TileManager();
 
+	void free_device();
 	void reset(BufferParams& params, int num_samples);
 	void set_samples(int num_samples);
 	bool next();
-	bool next_tile(Tile& tile, int device = 0);
+	bool next_tile(Tile* &tile, int device = 0);
+	bool finish_tile(int index, bool& delete_tile);
 	bool done();
 
 	void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; }
@@ -96,6 +111,9 @@ public:
 
 	/* Get number of actual samples to render. */
 	int get_num_effective_samples();
+
+	/* Schedule tiles for denoising after they've been rendered. */
+	bool schedule_denoising;
 protected:
 
 	void set_tiles();
@@ -127,6 +145,9 @@ protected:
 
 	/* Generate tile list, return number of tiles. */
 	int gen_tiles(bool sliced);
+
+	int get_neighbor_index(int index, int neighbor);
+	bool check_neighbor_state(int index, Tile::State state);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 388aba65460..43f9a57d099 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -59,6 +59,7 @@ set(SRC_HEADERS
 	util_math_int2.h
 	util_math_int3.h
 	util_math_int4.h
+	util_math_matrix.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52b4fa859b7..12abd8e201e 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -160,6 +160,78 @@ ccl_device_inline float max4(float a, float b, float c, float d)
 }
 
 #ifndef __KERNEL_OPENCL__
+/* Int/Float conversion */
+
+ccl_device_inline int as_int(uint i)
+{
+	union { uint ui; int i; } u;
+	u.ui = i;
+	return u.i;
+}
+
+ccl_device_inline uint as_uint(int i)
+{
+	union { uint ui; int i; } u;
+	u.i = i;
+	return u.ui;
+}
+
+ccl_device_inline uint as_uint(float f)
+{
+	union { uint i; float f; } u;
+	u.f = f;
+	return u.i;
+}
+
+ccl_device_inline int __float_as_int(float f)
+{
+	union { int i; float f; } u;
+	u.f = f;
+	return u.i;
+}
+
+ccl_device_inline float __int_as_float(int i)
+{
+	union { int i; float f; } u;
+	u.i = i;
+	return u.f;
+}
+
+ccl_device_inline uint __float_as_uint(float f)
+{
+	union { uint i; float f; } u;
+	u.f = f;
+	return u.i;
+}
+
+ccl_device_inline float __uint_as_float(uint i)
+{
+	union { uint i; float f; } u;
+	u.i = i;
+	return u.f;
+}
+#endif /* __KERNEL_OPENCL__ */
+
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+	unsigned int x = __float_as_uint(f);
+	return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+	/* By IEEE 754 rule, 2*Inf equals Inf */
+	unsigned int x = __float_as_uint(f);
+	return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+}
+
+ccl_device_inline float ensure_finite(float v)
+{
+	return isfinite_safe(v)? v : 0.0f;
+}
+
+#ifndef __KERNEL_OPENCL__
 ccl_device_inline int clamp(int a, int mn, int mx)
 {
 	return min(max(a, mn), mx);
@@ -250,57 +322,6 @@ CCL_NAMESPACE_END
 CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPENCL__
-/* Int/Float conversion */
-
-ccl_device_inline int as_int(uint i)
-{
-	union { uint ui; int i; } u;
-	u.ui = i;
-	return u.i;
-}
-
-ccl_device_inline uint as_uint(int i)
-{
-	union { uint ui; int i; } u;
-	u.i = i;
-	return u.ui;
-}
-
-ccl_device_inline uint as_uint(float f)
-{
-	union { uint i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline int __float_as_int(float f)
-{
-	union { int i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline float __int_as_float(int i)
-{
-	union { int i; float f; } u;
-	u.i = i;
-	return u.f;
-}
-
-ccl_device_inline uint __float_as_uint(float f)
-{
-	union { uint i; float f; } u;
-	u.f = f;
-	return u.i;
-}
-
-ccl_device_inline float __uint_as_float(uint i)
-{
-	union { uint i; float f; } u;
-	u.i = i;
-	return u.f;
-}
-
 /* Interpolation */
 
 template<class A, class B> A lerp(const A& a, const A& b, const B& t)
@@ -318,20 +339,6 @@ ccl_device_inline float triangle_area(const float3& v1,
 }
 #endif  /* __KERNEL_OPENCL__ */
 
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
-	unsigned int x = __float_as_uint(f);
-	return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
-	/* By IEEE 754 rule, 2*Inf equals Inf */
-	unsigned int x = __float_as_uint(f);
-	return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
-}
-
 /* Orthonormal vectors */
 
 ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index e0c6b551040..a754be413fe 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -38,6 +38,7 @@ ccl_device_inline float3 operator/(const float3& a, const float3& b);
 ccl_device_inline float3 operator+(const float3& a, const float3& b);
 ccl_device_inline float3 operator-(const float3& a, const float3& b);
 ccl_device_inline float3 operator+=(float3& a, const float3& b);
+ccl_device_inline float3 operator-=(float3& a, const float3& b);
 ccl_device_inline float3 operator*=(float3& a, const float3& b);
 ccl_device_inline float3 operator*=(float3& a, float f);
 ccl_device_inline float3 operator/=(float3& a, const float3& b);
@@ -166,6 +167,11 @@ ccl_device_inline float3 operator+=(float3& a, const float3& b)
 	return a = a + b;
 }
 
+ccl_device_inline float3 operator-=(float3& a, const float3& b)
+{
+	return a = a - b;
+}
+
 ccl_device_inline float3 operator*=(float3& a, const float3& b)
 {
 	return a = a * b;
@@ -360,6 +366,15 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 	return a == b;
 #endif
 }
+
+ccl_device_inline float3 ensure_finite3(float3 v)
+{
+	if(!isfinite_safe(v.x)) v.x = 0.0;
+	if(!isfinite_safe(v.y)) v.y = 0.0;
+	if(!isfinite_safe(v.z)) v.z = 0.0;
+	return v;
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 4b327c90c33..79a8c0841e7 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -103,6 +103,15 @@ ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b)
 	                 (mask.w)? a.w: b.w);
 #endif
 }
+
+ccl_device_inline int4 load_int4(const int *v)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_loadu_si128((__m128i*)v));
+#else
+	return make_int4(v[0], v[1], v[2], v[3]);
+#endif
+}
 #endif  /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
new file mode 100644
index 00000000000..31ea10f18a8
--- /dev/null
+++ b/intern/cycles/util/util_math_matrix.h
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_MATRIX_H__
+#define __UTIL_MATH_MATRIX_H__
+
+CCL_NAMESPACE_BEGIN
+
+#define MAT(A, size, row, col) A[(row)*(size)+(col)]
+
+/* Variants that use a constant stride on GPUS. */
+#ifdef __KERNEL_GPU__
+#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+/* Element access when only the lower-triangular elements are stored. */
+#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+#define VECS(V, i, s) V[(i)*(s)]
+#else
+#define MATS(A, n, r, c, s) MAT(A, n, r, c)
+#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+#define VECS(V, i, s) V[i]
+#endif
+
+/* Zeroing helpers. */
+
+ccl_device_inline void math_vector_zero(float *v, int n)
+{
+	for(int i = 0; i < n; i++)
+		v[i] = 0.0f;
+}
+
+ccl_device_inline void math_matrix_zero(float *A, int n)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MAT(A, n, row, col) = 0.0f;
+}
+
+/* Elementary vector operations. */
+
+ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i] += b[i];
+}
+
+ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i] *= b[i];
+}
+
+ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i*astride] *= b[i];
+}
+
+ccl_device_inline void math_vector_scale(float *a, float b, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i] *= b;
+}
+
+ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i] = max(a[i], b[i]);
+}
+
+ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
+{
+	for(int i = 0; i < n; i++)
+		v[i] += w*x[i];
+}
+
+ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
+{
+	for(int i = 0; i < n; i++)
+		v[i*stride] += w*x[i];
+}
+
+/* Elementary matrix operations.
+ * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
+
+ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
+{
+	for(int row = 0; row < n; row++)
+		MATHS(A, row, row, stride) += val;
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian(float *A,
+                                                  int n,
+                                                  float ccl_restrict_ptr v,
+                                                  float weight)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MAT(A, n, row, col) += v[row]*v[col]*weight;
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
+                                                          int n,
+                                                          float ccl_restrict_ptr v,
+                                                          float weight,
+                                                          int stride)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+}
+
+/* Transpose matrix A inplace. */
+ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
+{
+	for(int i = 0; i < n; i++) {
+		for(int j = 0; j < i; j++) {
+			float temp = MATS(A, n, i, j, stride);
+			MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
+			MATS(A, n, j, i, stride) = temp;
+		}
+	}
+}
+
+
+
+
+/* Solvers for matrix problems */
+
+/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
+ * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
+ * Also, only the lower triangular part of A is ever accessed. */
+ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {
+			float sum_col = MATHS(A, row, col, stride);
+			for(int k = 0; k < col; k++) {
+				sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
+			}
+			if(row == col) {
+				sum_col = sqrtf(max(sum_col, 0.0f));
+			}
+			else {
+				sum_col /= MATHS(A, col, col, stride);
+			}
+			MATHS(A, row, col, stride) = sum_col;
+		}
+	}
+}
+
+/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process.
+ *
+ * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
+ * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
+ * Since L is lower triangular, finding b is relatively easy since y is known.
+ * Then, the remaining problem is Lt*S = b, which again can be solved easily.
+ *
+ * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
+ * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
+ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
+{
+	math_trimatrix_add_diagonal(A, n, 1e-4f, stride); /* Improve the numerical stability. */
+	math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
+
+	/* Use forward substitution to solve L*b = y, replacing y by b. */
+	for(int row = 0; row < n; row++) {
+		float3 sum = VECS(y, row, stride);
+		for(int col = 0; col < row; col++)
+			sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+	}
+
+	/* Use backward substitution to solve Lt*S = b, replacing b by S. */
+	for(int row = n-1; row >= 0; row--) {
+		float3 sum = VECS(y, row, stride);
+		for(int col = row+1; col < n; col++)
+			sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
+		VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
+	}
+}
+
+
+
+
+
+/* Perform the Jacobi Eigenvalue Methon on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
+ * The algorithm overwrites the contents of A.
+ *
+ * After returning, A will be overwritten with D, which is (almost) diagonal,
+ * and V will contain the eigenvectors of the original A in its rows (!),
+ * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
+ */
+ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride)
+{
+	const float singular_epsilon = 1e-9f;
+
+	for (int row = 0; row < n; row++)
+		for (int col = 0; col < n; col++)
+			MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+
+	for (int sweep = 0; sweep < 8; sweep++) {
+		float off_diagonal = 0.0f;
+		for (int row = 1; row < n; row++)
+			for (int col = 0; col < row; col++)
+				off_diagonal += fabsf(MAT(A, n, row, col));
+		if (off_diagonal < 1e-7f) {
+			/* The matrix has nearly reached diagonal form.
+			 * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
+			break;
+		}
+
+		/* Set the threshold for the small element rotation skip in the first sweep:
+		 * Skip all elements that are less than a tenth of the average off-diagonal element. */
+		float threshold = 0.2f*off_diagonal / (n*n);
+
+		for(int row = 1; row < n; row++) {
+			for(int col = 0; col < row; col++) {
+				/* Perform a Jacobi rotation on this element that reduces it to zero. */
+				float element = MAT(A, n, row, col);
+				float abs_element = fabsf(element);
+
+				/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
+				if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+					MAT(A, n, row, col) = 0.0f;
+					continue;
+				}
+
+				if(element == 0.0f) {
+					continue;
+				}
+
+				/* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */
+				if(sweep < 3 && (abs_element < threshold)) {
+					continue;
+				}
+
+				/* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi).
+				 * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case.
+				 * Then, we compute sin(phi) and cos(phi) themselves. */
+				float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
+				float ratio;
+				if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+					float cot_2phi = 0.5f*singular_diff / element;
+					ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
+					if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+				}
+				else {
+					ratio = element / singular_diff;
+				}
+
+				float c = 1.0f / sqrtf(1.0f + ratio*ratio);
+				float s = ratio*c;
+				/* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */
+				float tan_phi_2 = s / (1.0f + c);
+
+				/* Update the singular values in the diagonal. */
+				float singular_delta = ratio*element;
+				MAT(A, n, row, row) += singular_delta;
+				MAT(A, n, col, col) -= singular_delta;
+
+				/* Set the element itself to zero. */
+				MAT(A, n, row, col) = 0.0f;
+
+				/* Perform the actual rotations on the matrices. */
+#define ROT(M, r1, c1, r2, c2, stride)                                   \
+				{                                                        \
+					float M1 = MATS(M, n, r1, c1, stride);               \
+					float M2 = MATS(M, n, r2, c2, stride);               \
+					MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \
+					MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \
+				}
+
+				/* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */
+				for(int i = 0    ; i < col; i++) ROT(A, col, i, row, i, 1);
+				for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1);
+				for(int i = row+1; i < n  ; i++) ROT(A, i, col, i, row, 1);
+
+				for(int i = 0    ; i < n  ; i++) ROT(V, col, i, row, i, v_stride);
+#undef ROT
+			}
+		}
+	}
+
+	/* Sort eigenvalues and the associated eigenvectors. */
+	for (int i = 0; i < n - 1; i++) {
+		float v = MAT(A, n, i, i);
+		int k = i;
+		for (int j = i; j < n; j++) {
+			if (MAT(A, n, j, j) >= v) {
+				v = MAT(A, n, j, j);
+				k = j;
+			}
+		}
+		if (k != i) {
+			/* Swap eigenvalues. */
+			MAT(A, n, k, k) = MAT(A, n, i, i);
+			MAT(A, n, i, i) = v;
+			/* Swap eigenvectors. */
+			for (int j = 0; j < n; j++) {
+				float v = MATS(V, n, i, j, v_stride);
+				MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
+				MATS(V, n, k, j, v_stride) = v;
+			}
+		}
+	}
+}
+
+#ifdef __KERNEL_SSE3__
+
+ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+{
+	for(int i = 0; i < n; i++)
+		A[i] = _mm_setzero_ps();
+}
+ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MAT(A, n, row, col) = _mm_setzero_ps();
+}
+
+/* Add Gramian matrix of v to A.
+ * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
+ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+}
+
+ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+{
+	for(int i = 0; i < n; i++)
+		V[i] = _mm_add_ps(V[i], a[i]);
+}
+
+ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a)
+{
+	for(int i = 0; i < n; i++)
+		V[i] = _mm_mul_ps(V[i], a[i]);
+}
+
+ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n)
+{
+	for(int i = 0; i < n; i++)
+		a[i] = _mm_max_ps(a[i], b[i]);
+}
+
+ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B)
+{
+	for(int row = 0; row < n; row++)
+		for(int col = 0; col <= row; col++)
+			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+}
+#endif
+
+#undef MAT
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_MATH_MATRIX_H__ */
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 557809a5719..545a3399f32 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v)
 static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
 static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
 
+#ifdef __KERNEL_SSE3__
 /* Emulation of SSE4 functions with SSE3 */
-
-#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__)
+#  ifndef __KERNEL_SSE41__
 
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -362,7 +362,7 @@ __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
   char* _r = (char*)(&rvalue + 1);
   char* _v = (char*)(& value + 1);
   char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))*  *((int32*)(_i + i));
+  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))*  *((int32_t*)(_i + i));
   return rvalue;
 }
 
@@ -395,7 +395,7 @@ __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int inde
 
 #define _mm_extract_ps __emu_mm_extract_ps
 __forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32* ptr = (int32*)&input; return ptr[index];
+  int32_t* ptr = (int32_t*)&input; return ptr[index];
 }
 
 #define _mm_insert_ps __emu_mm_insert_ps
@@ -415,7 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-#ifdef _M_X64
+#    ifdef _M_X64
 #define _mm_insert_epi64 __emu_mm_insert_epi64
 __forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { 
     assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; 
@@ -426,7 +426,40 @@ __forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
     assert(size_t(index) < 2); 
     return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); 
 }
-#endif
+#    endif
+
+#  endif
+
+#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
+
+/* Return a __m128 with every element set to the largest element of v. */
+ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
+{
+  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
+  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
+  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
+  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
+  return v;
+}
+
+/* Return the sum of the four elements of x. */
+ccl_device_inline float _mm_hsum_ss(__m128 x)
+{
+    __m128 a = _mm_movehdup_ps(x);
+    __m128 b = _mm_add_ps(x, a);
+    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
+}
+
+/* Return a __m128 with every element set to the sum of the four elements of x. */
+ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
+{
+    x = _mm_hadd_ps(x, x);
+    x = _mm_hadd_ps(x, x);
+    return x;
+}
+
+/* Replace elements of x with zero where mask isn't set. */
+#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
 
 #endif
 
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index feab7996aee..0039c59ec48 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -133,6 +133,11 @@ ccl_device_inline size_t align_up(size_t offset, size_t alignment)
 	return (offset + alignment - 1) & ~(alignment - 1);
 }
 
+ccl_device_inline size_t divide_up(size_t x, size_t y)
+{
+	return (x + y - 1) / y;
+}
+
 ccl_device_inline size_t round_up(size_t x, size_t multiple)
 {
 	return ((x + multiple - 1) / multiple) * multiple;
author	Lukas Stockner <lukas.stockner@freenet.de>	2017-05-07 15:40:58 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2017-05-07 15:40:58 +0300
commit	43b374e8c5430488a302298b1026faa1c3a231e9 (patch)
tree	42e619a9fa08d02cef515b6315ce34dd7fd062b2 /intern/cycles
parent	bca697834728fd12c84941aa2a428abfe2090b27 (diff)