105 files changed, 1176 insertions, 644 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index b5149b5082e..3018fd5b316 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -239,7 +239,8 @@ def register_passes(engine, scene, srl):
     if crl.pass_debug_bvh_intersections:       engine.register_pass(scene, srl, "Debug BVH Intersections",       1, "X", 'VALUE')
     if crl.pass_debug_ray_bounces:             engine.register_pass(scene, srl, "Debug Ray Bounces",             1, "X", 'VALUE')
 
-    if crl.use_denoising and crl.denoising_store_passes:
+    cscene = scene.cycles
+    if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine:
         engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR')
         engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
         engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 8bb25aba13c..68474529ed3 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -695,10 +695,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             update=devices_update_callback
             )
 
-        cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=True, update=devices_update_callback);
+        cls.debug_opencl_kernel_single_program = BoolProperty(
+            name="Single Program",
+            default=True,
+            update=devices_update_callback,
+            )
 
         cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
 
+        cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0, min=0,  # MB; a negative value would wrap in the size_t cast
+            description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)")
+
     @classmethod
     def unregister(cls):
         del bpy.types.Scene.cycles
@@ -1209,6 +1216,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
                 name="Use Denoising",
                 description="Denoise the rendered image",
                 default=False,
+                update=update_render_passes,
                 )
         cls.denoising_diffuse_direct = BoolProperty(
                 name="Diffuse Direct",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 4ed3ccd9a2c..49beebe5ab4 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -531,17 +531,17 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_environment")
 
         if context.scene.cycles.feature_set == 'EXPERIMENTAL':
-           col.separator()
-           sub = col.column()
-           sub.active = crl.use_denoising
-           sub.prop(crl, "denoising_store_passes", text="Denoising")
+            col.separator()
+            sub = col.column()
+            sub.active = crl.use_denoising
+            sub.prop(crl, "denoising_store_passes", text="Denoising")
 
         if _cycles.with_cycles_debug:
-          col = layout.column()
-          col.prop(crl, "pass_debug_bvh_traversed_nodes")
-          col.prop(crl, "pass_debug_bvh_traversed_instances")
-          col.prop(crl, "pass_debug_bvh_intersections")
-          col.prop(crl, "pass_debug_ray_bounces")
+            col = layout.column()
+            col.prop(crl, "pass_debug_bvh_traversed_nodes")
+            col.prop(crl, "pass_debug_bvh_traversed_instances")
+            col.prop(crl, "pass_debug_bvh_intersections")
+            col.prop(crl, "pass_debug_ray_bounces")
 
 
 class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
@@ -1608,6 +1608,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         col.prop(cscene, "debug_opencl_device_type", text="Device")
         col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
         col.prop(cscene, "debug_use_opencl_debug", text="Debug")
+        col.prop(cscene, "debug_opencl_mem_limit")
 
 
 class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
@@ -1710,7 +1711,7 @@ def draw_device(self, context):
 
         layout.prop(cscene, "feature_set")
 
-        split = layout.split(percentage=1/3)
+        split = layout.split(percentage=1 / 3)
         split.label("Device:")
         row = split.row()
         row.active = show_device_active(context)
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 01570b1e3f9..54973fd1b7f 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -106,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+	flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024;
 	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index c6a59577507..2b5dd5eadea 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -399,14 +399,7 @@ void BlenderSession::render()
 		BL::RenderLayer b_rlay = *b_single_rlay;
 
 		/* add passes */
-		array<Pass> passes;
-		if(session_params.device.advanced_shading) {
-			passes = sync->sync_render_passes(b_rlay, *b_layer_iter);
-		}
-		else {
-			Pass::add(PASS_COMBINED, passes);
-		}
-
+		array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
 		buffer_params.passes = passes;
 
 		PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 41723599874..3a00384458a 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -553,11 +553,16 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 }
 
 array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
-                                            BL::SceneRenderLayer& b_srlay)
+                                            BL::SceneRenderLayer& b_srlay,
+                                            const SessionParams &session_params)
 {
 	array<Pass> passes;
 	Pass::add(PASS_COMBINED, passes);
 
+	if(!session_params.device.advanced_shading) {
+		return passes;
+	}
+
 	/* loop over passes */
 	BL::RenderLayer::passes_iterator b_pass_iter;
 
@@ -572,7 +577,9 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 	}
 
 	PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
-	if(get_boolean(crp, "denoising_store_passes")) {
+	if(get_boolean(crp, "denoising_store_passes") &&
+	   get_boolean(crp, "use_denoising") &&
+	   !session_params.progressive_refine) {
 		b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
 		b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
 		b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 0950285d976..4ec46424b5a 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -68,7 +68,8 @@ public:
 	               const char *layer = 0);
 	void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
 	array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
-	                               BL::SceneRenderLayer& b_srlay);
+	                               BL::SceneRenderLayer& b_srlay,
+	                               const SessionParams &session_params);
 	void sync_integrator();
 	void sync_camera(BL::RenderSettings& b_render,
 	                 BL::Object& b_override,
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 0603ecb3afb..a54bb77f9f3 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -68,6 +68,8 @@ std::ostream& operator <<(std::ostream &os,
 	   << string_from_bool(requested_features.use_transparent) << std::endl;
 	os << "Use Principled BSDF: "
 	   << string_from_bool(requested_features.use_principled) << std::endl;
+	os << "Use Denoising: "
+	   << string_from_bool(requested_features.use_denoising) << std::endl;
 	return os;
 }
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 527940e8f50..b3b693c630c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -127,6 +127,9 @@ public:
 	/* Per-uber shader usage flags. */
 	bool use_principled;
 
+	/* Denoising features. */
+	bool use_denoising;
+
 	DeviceRequestedFeatures()
 	{
 		/* TODO(sergey): Find more meaningful defaults. */
@@ -145,6 +148,7 @@ public:
 		use_transparent = false;
 		use_shadow_tricks = false;
 		use_principled = false;
+		use_denoising = false;
 	}
 
 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -163,7 +167,8 @@ public:
 		         use_patch_evaluation == requested_features.use_patch_evaluation &&
 		         use_transparent == requested_features.use_transparent &&
 		         use_shadow_tricks == requested_features.use_shadow_tricks &&
-		         use_principled == requested_features.use_principled);
+		         use_principled == requested_features.use_principled &&
+		         use_denoising == requested_features.use_denoising);
 	}
 
 	/* Convert the requested features structure to a build options,
@@ -213,6 +218,9 @@ public:
 		if(!use_principled) {
 			build_options += " -D__NO_PRINCIPLED__";
 		}
+		if(!use_denoising) {
+			build_options += " -D__NO_DENOISING__";
+		}
 		return build_options;
 	}
 };
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c2f74aa8903..18112437b45 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -149,7 +149,8 @@ public:
 	                                            device_memory& use_queues_flag,
 	                                            device_memory& work_pool_wgs);
 
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&);
 	virtual int2 split_kernel_local_size();
 	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
 	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
@@ -185,9 +186,9 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
 	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                                              filter_construct_transform_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
-	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                                       filter_finalize_kernel;
+	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+	KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
 
 	KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
 	                       ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -248,6 +249,7 @@ public:
 		REGISTER_SPLIT_KERNEL(direct_lighting);
 		REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
 		REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+		REGISTER_SPLIT_KERNEL(enqueue_inactive);
 		REGISTER_SPLIT_KERNEL(next_iteration_setup);
 		REGISTER_SPLIT_KERNEL(indirect_subsurface);
 		REGISTER_SPLIT_KERNEL(buffer_update);
@@ -465,8 +467,6 @@ public:
 
 	bool denoising_reconstruct(device_ptr color_ptr,
 	                           device_ptr color_variance_ptr,
-	                           device_ptr guide_ptr,
-	                           device_ptr guide_variance_ptr,
 	                           device_ptr output_ptr,
 	                           DenoisingTask *task)
 	{
@@ -485,8 +485,8 @@ public:
 			                     task->reconstruction_state.source_w - max(0, dx),
 			                     task->reconstruction_state.source_h - max(0, dy)};
 			filter_nlm_calc_difference_kernel()(dx, dy,
-			                                    (float*) guide_ptr,
-			                                    (float*) guide_variance_ptr,
+			                                    (float*) color_ptr,
+			                                    (float*) color_variance_ptr,
 			                                    difference,
 			                                    local_rect,
 			                                    task->buffer.w,
@@ -499,8 +499,6 @@ public:
 			filter_nlm_construct_gramian_kernel()(dx, dy,
 			                                      blurDifference,
 			                                      (float*)  task->buffer.mem.device_pointer,
-			                                      (float*)  color_ptr,
-			                                      (float*)  color_variance_ptr,
 			                                      (float*)  task->storage.transform.device_pointer,
 			                                      (int*)    task->storage.rank.device_pointer,
 			                                      (float*)  task->storage.XtWX.device_pointer,
@@ -648,7 +646,7 @@ public:
 		DenoisingTask denoising(this);
 
 		denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
-		denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
 		denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
 		denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
@@ -935,7 +933,8 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
 	return true;
 }
 
-SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
+                                                               const DeviceRequestedFeatures&)
 {
 	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
 
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 99537e9a983..3a29538aa13 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -105,7 +105,8 @@ public:
 	                                            device_memory& use_queues_flag,
 	                                            device_memory& work_pool_wgs);
 
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&);
 	virtual int2 split_kernel_local_size();
 	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
 };
@@ -1051,8 +1052,6 @@ public:
 
 	bool denoising_reconstruct(device_ptr color_ptr,
 	                           device_ptr color_variance_ptr,
-	                           device_ptr guide_ptr,
-	                           device_ptr guide_variance_ptr,
 	                           device_ptr output_ptr,
 	                           DenoisingTask *task)
 	{
@@ -1096,8 +1095,8 @@ public:
 			                     task->reconstruction_state.source_h - max(0, dy)};
 
 			void *calc_difference_args[] = {&dx, &dy,
-			                                &guide_ptr,
-			                                &guide_variance_ptr,
+			                                &color_ptr,
+			                                &color_variance_ptr,
 			                                &difference,
 			                                &local_rect,
 			                                &task->buffer.w,
@@ -1126,8 +1125,6 @@ public:
 			void *construct_gramian_args[] = {&dx, &dy,
 			                                  &blurDifference,
 			                                  &task->buffer.mem.device_pointer,
-			                                  &color_ptr,
-			                                  &color_variance_ptr,
 			                                  &task->storage.transform.device_pointer,
 			                                  &task->storage.rank.device_pointer,
 			                                  &task->storage.XtWX.device_pointer,
@@ -1294,7 +1291,7 @@ public:
 		DenoisingTask denoising(this);
 
 		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
 		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
 		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
@@ -2041,7 +2038,8 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
 	return !device->have_error();
 }
 
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+                                                                const DeviceRequestedFeatures&)
 {
 	CUfunction func;
 
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 613bd9112cf..619cc1d171e 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -215,7 +215,7 @@ bool DenoisingTask::run_denoising()
 	{
 		device_sub_ptr color_ptr    (device, buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
 		device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
-		functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
+		functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr);
 	}
 
 	device->mem_free(storage.XtWX);
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 25b93c2ad74..def7b72f67d 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -58,8 +58,6 @@ public:
 		              )> non_local_means;
 		function<bool(device_ptr color_ptr,
 		              device_ptr color_variance_ptr,
-		              device_ptr guide_ptr,
-		              device_ptr guide_variance_ptr,
 		              device_ptr output_ptr
 		              )> reconstruct;
 		function<bool()> construct_transform;
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index edd2047debc..681b8214b03 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -130,10 +130,22 @@ string device_opencl_capabilities(void)
 		opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
 		result += string_printf("%s: %s\n", name, data); \
 	} while(false)
+#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
+	do { \
+		char data[1024] = "\0"; \
+		size_t length = 0; \
+		if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \
+			if(length != 0 && data[0] != '\0') { \
+				result += string_printf("%s: %s\n", name, data); \
+			} \
+		} \
+	} while(false)
 #define APPEND_PLATFORM_STRING_INFO(id, name, what) \
 	APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what)
 #define APPEND_DEVICE_STRING_INFO(id, name, what) \
 	APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
+#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
+	APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
 
 	vector<cl_device_id> device_ids;
 	for(cl_uint platform = 0; platform < num_platforms; ++platform) {
@@ -167,6 +179,7 @@ string device_opencl_capabilities(void)
 			result += string_printf("\t\tDevice: #%u\n", device);
 
 			APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
+			APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
 			APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
 			APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
 			APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index dddd19f179f..d2b3a89fa98 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -47,6 +47,7 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
 	kernel_direct_lighting = NULL;
 	kernel_shadow_blocked_ao = NULL;
 	kernel_shadow_blocked_dl = NULL;
+	kernel_enqueue_inactive = NULL;
 	kernel_next_iteration_setup = NULL;
 	kernel_indirect_subsurface = NULL;
 	kernel_buffer_update = NULL;
@@ -74,6 +75,7 @@ DeviceSplitKernel::~DeviceSplitKernel()
 	delete kernel_direct_lighting;
 	delete kernel_shadow_blocked_ao;
 	delete kernel_shadow_blocked_dl;
+	delete kernel_enqueue_inactive;
 	delete kernel_next_iteration_setup;
 	delete kernel_indirect_subsurface;
 	delete kernel_buffer_update;
@@ -101,6 +103,7 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
 	LOAD_KERNEL(direct_lighting);
 	LOAD_KERNEL(shadow_blocked_ao);
 	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(enqueue_inactive);
 	LOAD_KERNEL(next_iteration_setup);
 	LOAD_KERNEL(indirect_subsurface);
 	LOAD_KERNEL(buffer_update);
@@ -256,6 +259,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
 				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 68c2ba974a5..9c42cb58520 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -69,6 +69,7 @@ private:
 	SplitKernelFunction *kernel_direct_lighting;
 	SplitKernelFunction *kernel_shadow_blocked_ao;
 	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_enqueue_inactive;
 	SplitKernelFunction *kernel_next_iteration_setup;
 	SplitKernelFunction *kernel_indirect_subsurface;
 	SplitKernelFunction *kernel_buffer_update;
@@ -124,7 +125,8 @@ public:
 	                                            device_memory& use_queues_flag,
 	                                            device_memory& work_pool_wgs) = 0;
 
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&) = 0;
 	virtual int2 split_kernel_local_size() = 0;
 	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
 };
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 27e196d1e68..7da690904aa 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -84,7 +84,7 @@ public:
 	                                   string *error = NULL);
 	static bool device_version_check(cl_device_id device,
 	                                 string *error = NULL);
-	static string get_hardware_id(string platform_name,
+	static string get_hardware_id(const string& platform_name,
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
@@ -130,6 +130,11 @@ public:
 	                            cl_int* error = NULL);
 	static cl_device_type get_device_type(cl_device_id device_id);
 
+	static bool get_driver_version(cl_device_id device_id,
+	                               int *major,
+	                               int *minor,
+	                               cl_int* error = NULL);
+
 	static int mem_address_alignment(cl_device_id device_id);
 
 	/* Get somewhat more readable device name.
@@ -242,17 +247,17 @@ public:
 	public:
 		OpenCLProgram() : loaded(false), device(NULL) {}
 		OpenCLProgram(OpenCLDeviceBase *device,
-		              string program_name,
-		              string kernel_name,
-		              string kernel_build_options,
+		              const string& program_name,
+		              const string& kernel_name,
+		              const string& kernel_build_options,
 		              bool use_stdout = true);
 		~OpenCLProgram();
 
 		void add_kernel(ustring name);
 		void load();
 
-		bool is_loaded()    { return loaded; }
-		string get_log()    { return log; }
+		bool is_loaded() const { return loaded; }
+		const string& get_log() const { return log; }
 		void report_error();
 
 		cl_kernel operator()();
@@ -266,8 +271,8 @@ public:
 		bool load_binary(const string& clbin, const string *debug_src = NULL);
 		bool save_binary(const string& clbin);
 
-		void add_log(string msg, bool is_debug);
-		void add_error(string msg);
+		void add_log(const string& msg, bool is_debug);
+		void add_error(const string& msg);
 
 		bool loaded;
 		cl_program program;
@@ -390,8 +395,6 @@ protected:
 	bool denoising_construct_transform(DenoisingTask *task);
 	bool denoising_reconstruct(device_ptr color_ptr,
 	                           device_ptr color_variance_ptr,
-	                           device_ptr guide_ptr,
-	                           device_ptr guide_variance_ptr,
 	                           device_ptr output_ptr,
 	                           DenoisingTask *task);
 	bool denoising_combine_halves(device_ptr a_ptr,
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 24b70e3446c..509da7a0a84 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -20,6 +20,7 @@
 
 #include "kernel/kernel_types.h"
 
+#include "util/util_algorithm.h"
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_md5.h"
@@ -276,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp
 
 	size_t size = mem.memory_size();
 
+	/* check there is enough memory available for the allocation */
+	cl_ulong max_alloc_size = 0;
+	clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
+
+	if(DebugFlags().opencl.mem_limit) {
+		const cl_ulong mem_limit = DebugFlags().opencl.mem_limit; /* Usage may already exceed it: clamp, don't wrap. */
+		max_alloc_size = min(max_alloc_size, cl_ulong(stats.mem_used < mem_limit ? mem_limit - stats.mem_used : 0));
+	}
+
+	if(size > max_alloc_size) {
+		string error = "Scene too complex to fit in available memory.";
+		if(name != NULL) {
+			error += string_printf(" (allocating buffer %s failed.)", name);
+		}
+		set_error(error);
+
+		return;
+	}
+
 	cl_mem_flags mem_flag;
 	void *mem_ptr = NULL;
 
@@ -693,8 +713,6 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
 
 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                                              device_ptr color_variance_ptr,
-                                             device_ptr guide_ptr,
-                                             device_ptr guide_variance_ptr,
                                              device_ptr output_ptr,
                                              DenoisingTask *task)
 {
@@ -703,8 +721,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 
 	cl_mem color_mem = CL_MEM_PTR(color_ptr);
 	cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
-	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
-	cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
 	cl_mem output_mem = CL_MEM_PTR(output_ptr);
 
 	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
@@ -735,8 +751,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 
 		kernel_set_args(ckNLMCalcDifference, 0,
 		                dx, dy,
-		                guide_mem,
-		                guide_variance_mem,
+		                color_mem,
+		                color_variance_mem,
 		                difference,
 		                local_rect,
 		                task->buffer.w,
@@ -775,8 +791,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 		                dx, dy,
 		                blurDifference,
 		                buffer_mem,
-		                color_mem,
-		                color_variance_mem,
 		                transform_mem,
 		                rank_mem,
 		                XtWX_mem,
@@ -961,7 +975,7 @@ void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
 
 	denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
 	denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
-	denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+	denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
 	denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
 	denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
 	denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
@@ -1232,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel(
 }
 
 string OpenCLDeviceBase::build_options_for_base_program(
-        const DeviceRequestedFeatures& /*requested_features*/)
+        const DeviceRequestedFeatures& requested_features)
 {
 	/* TODO(sergey): By default we compile all features, meaning
 	 * mega kernel is not getting feature-based optimizations.
@@ -1240,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program(
 	 * Ideally we need always compile kernel with as less features
 	 * enabled as possible to keep performance at it's max.
 	 */
+
+	/* For now disable baking when not in use as this has major
+	 * impact on kernel build times.
+	 */
+	if(!requested_features.use_baking) {
+		return "-D__NO_BAKING__";
+	}
+
 	return "";
 }
 
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 76dcbd6fc9a..76d9983e9a2 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -25,6 +25,7 @@
 
 #include "device/device_split_kernel.h"
 
+#include "util/util_algorithm.h"
 #include "util/util_logging.h"
 #include "util/util_md5.h"
 #include "util/util_path.h"
@@ -176,17 +177,62 @@ protected:
 	friend class OpenCLSplitKernelFunction;
 };
 
+struct CachedSplitMemory {  /* Pointers to the objects that OpenCLSplitKernelFunction::enqueue() passes as kernel arguments. */
+	int id;  /* Compared against each kernel function's cached_id to decide whether arguments must be set again. NOTE(review): no initializer is visible in this patch -- confirm it is set before first use. */
+	device_memory *split_data;
+	device_memory *ray_state;
+	device_ptr *rng_state;  /* Assigned the address of rtile.rng_state in this patch. */
+	device_memory *queue_index;
+	device_memory *use_queues_flag;
+	device_memory *work_pools;
+	device_ptr *buffer;  /* Assigned the address of rtile.buffer in this patch. */
+};
+
 class OpenCLSplitKernelFunction : public SplitKernelFunction {
 public:
 	OpenCLDeviceSplitKernel* device;
 	OpenCLDeviceBase::OpenCLProgram program;
+	CachedSplitMemory& cached_memory;
+	int cached_id;
+
+	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
+			device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
+	{
+	}
 
-	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
-	~OpenCLSplitKernelFunction() { program.release(); }
+	~OpenCLSplitKernelFunction()
+	{
+		program.release();
+	}
 
 	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
 	{
-		device->kernel_set_args(program(), 0, kg, data);
+		if(cached_id != cached_memory.id) {
+			cl_uint start_arg_index =
+				device->kernel_set_args(program(),
+					            0,
+					            kg,
+					            data,
+					            *cached_memory.split_data,
+					            *cached_memory.ray_state,
+					            *cached_memory.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+				device->set_kernel_arg_mem(program(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+
+			start_arg_index +=
+				device->kernel_set_args(program(),
+					            start_arg_index,
+					            *cached_memory.queue_index,
+					            *cached_memory.use_queues_flag,
+					            *cached_memory.work_pools,
+					            *cached_memory.buffer);
+
+			cached_id = cached_memory.id;
+		}
 
 		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
 		                                       program(),
@@ -213,14 +259,15 @@ public:
 
 class OpenCLSplitKernel : public DeviceSplitKernel {
 	OpenCLDeviceSplitKernel *device;
+	CachedSplitMemory cached_memory;
 public:
 	explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
 	}
 
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
 	                                                       const DeviceRequestedFeatures& requested_features)
 	{
-		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
+		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
 
 		bool single_program = OpenCLInfo::use_single_program();
 		kernel->program =
@@ -349,6 +396,15 @@ public:
 			return false;
 		}
 
+		cached_memory.split_data = &split_data;
+		cached_memory.ray_state = &ray_state;
+		cached_memory.rng_state = &rtile.rng_state;
+		cached_memory.queue_index = &queue_index;
+		cached_memory.use_queues_flag = &use_queues_flag;
+		cached_memory.work_pools = &work_pool_wgs;
+		cached_memory.buffer = &rtile.buffer;
+		cached_memory.id++;
+
 		return true;
 	}
 
@@ -368,12 +424,18 @@ public:
 
 		cl_ulong max_buffer_size;
 		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+		if(DebugFlags().opencl.mem_limit) {
+			max_buffer_size = min(max_buffer_size,
+			                      cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
+		}
+
 		VLOG(1) << "Maximum device allocation size: "
 		        << string_human_readable_number(max_buffer_size) << " bytes. ("
 		        << string_human_readable_size(max_buffer_size) << ").";
 
 		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
-		int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
+		int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
 		VLOG(1) << "Global size: " << global_size << ".";
 		return global_size;
 	}
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 642c1bfa11c..0d34af3e040 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5()
 }
 
 OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
-                                               string program_name,
-                                               string kernel_file,
-                                               string kernel_build_options,
+                                               const string& program_name,
+                                               const string& kernel_file,
+                                               const string& kernel_build_options,
                                                bool use_stdout)
  : device(device),
    program_name(program_name),
@@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release()
 	}
 }
 
-void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
+void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug)
 {
 	if(!use_stdout) {
 		log += msg + "\n";
@@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
 	}
 }
 
-void OpenCLDeviceBase::OpenCLProgram::add_error(string msg)
+void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg)
 {
 	if(use_stdout) {
 		fprintf(stderr, "%s\n", msg.c_str());
@@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name,
 	if(!get_device_name(device_id, &device_name)) {
 		return false;
 	}
+
+	int driver_major = 0;
+	int driver_minor = 0;
+	if(!get_driver_version(device_id, &driver_major, &driver_minor)) {
+		return false;
+	}
+	VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
+
 	/* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
 	 * (aka, it will not be on Intel framework). This isn't supported
 	 * and needs an explicit blacklist.
@@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name,
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
+		if(driver_major < 2236) {
+			VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
+			return false;
+		}
+		const char *blacklist[] = {
+			/* GCN 1 */
+			"Tahiti", "Pitcairn", "Capeverde", "Oland",
+			NULL
+		};
+		for (int i = 0; blacklist[i] != NULL; i++) {
+			if(device_name == blacklist[i]) {
+				VLOG(1) << "AMD device " << device_name << " not supported";
+				return false;
+			}
+		}
 		return true;
 	}
 	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
@@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device,
 	return true;
 }
 
-string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id)
+string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id)
 {
 	if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
 		/* Use cl_amd_device_topology extension. */
@@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
 	                   CL_DEVICE_BOARD_NAME_AMD,
 	                   sizeof(board_name),
 	                   &board_name,
-					   &length) == CL_SUCCESS)
+	                   &length) == CL_SUCCESS)
 	{
 		if(length != 0 && board_name[0] != '\0') {
 			return board_name;
@@ -1073,6 +1096,34 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
 	return get_device_name(device_id);
 }
 
+bool OpenCLInfo::get_driver_version(cl_device_id device_id,
+                                    int *major,
+                                    int *minor,
+                                    cl_int* error)
+{  /* Queries CL_DRIVER_VERSION for device_id and parses it as "<major>.<minor>"; returns false if the query or the parse fails. */
+	char buffer[1024];
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DRIVER_VERSION,
+	                          sizeof(buffer),
+	                          &buffer,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {  /* error is optional; it is only written when the caller supplied it. */
+			*error = err;
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;  /* Remains CL_SUCCESS even when the parse below fails. */
+	}
+	if(sscanf(buffer, "%d.%d", major, minor) < 2) {  /* Both numeric fields must be converted. */
+		VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
+		return false;
+	}
+	return true;
+}
+
 int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
 {
 	int base_align_bits;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index bef869f34b4..23e9bd311c4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -45,6 +45,7 @@ set(SRC
 	kernels/opencl/kernel_direct_lighting.cl
 	kernels/opencl/kernel_shadow_blocked_ao.cl
 	kernels/opencl/kernel_shadow_blocked_dl.cl
+	kernels/opencl/kernel_enqueue_inactive.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
 	kernels/opencl/kernel_indirect_subsurface.cl
 	kernels/opencl/kernel_buffer_update.cl
@@ -121,6 +122,10 @@ set(SRC_KERNELS_CUDA_HEADERS
 	kernels/cuda/kernel_config.h
 )
 
+set(SRC_KERNELS_OPENCL_HEADERS
+	kernels/opencl/kernel_split_function.h
+)
+
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@@ -278,6 +283,7 @@ set(SRC_SPLIT_HEADERS
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
 	split/kernel_do_volume.h
+	split/kernel_enqueue_inactive.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
 	split/kernel_indirect_background.h
 	split/kernel_indirect_subsurface.h
@@ -450,6 +456,7 @@ add_library(cycles_kernel
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
 	${SRC_KERNELS_CUDA_HEADERS}
+	${SRC_KERNELS_OPENCL_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_FILTER_HEADERS}
@@ -490,9 +497,11 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_sc
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index a04c157dc40..86a00d2124d 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -423,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
 			return bsdf_hair_merge(a, b);
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+			return bsdf_principled_diffuse_merge(a, b);
+#endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
 			return volume_henyey_greenstein_merge(a, b);
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 30cc8b90330..b12e248f0a3 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -288,12 +288,16 @@ ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
 	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
 	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
 
@@ -302,12 +306,16 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
 	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
 	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
 
@@ -343,12 +351,16 @@ ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
 	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
 	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = saturate(bsdf->alpha_y);
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index b07b515c405..22d0092093a 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -245,35 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r)
 	return saturate(albedo);
 }
 
+ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior)
+{  /* Returns a saturate()d combination of terms that depend on a (first saturate()d) and ior (inverted if below 1, then clamped to [1, 3]). */
+	if(ior < 1.0f) {
+		ior = 1.0f/ior;  /* Work with the reciprocal so that ior >= 1 from here on. */
+	}
+	a = saturate(a);
+	ior = clamp(ior, 1.0f, 3.0f);
+	float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f;  /* I_* depend only on ior, R_* only on a. */
+	float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f;  /* Polynomial in a, written in Horner form. */
+	float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior);
+	float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f;
+
+	return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f);  /* NOTE(review): constants look like a numerical fit -- origin not shown here, confirm. */
+}
+
 ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);  /* Denominator is floored at 1e-7 to avoid dividing by zero. */
+
+	float multiscatter = wo.z * M_1_PI_F;  /* Presumably a cosine-weighted hemisphere pdf (wo.z/pi) -- TODO confirm. */
+
 	float albedo = mf_ggx_albedo(alpha);
-	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;  /* Blend of the two terms, weighted by albedo and (1 - albedo). */
 }
 
 ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
-	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
+	float D = D_ggx_aniso(normalize(wi+wo), alpha);
+	float lambda = mf_lambda(wi, alpha);
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);  /* Denominator is now floored at 1e-7; the removed expression divided unguarded. */
+
+	float multiscatter = wo.z * M_1_PI_F;  /* Presumably a cosine-weighted hemisphere pdf (wo.z/pi) -- TODO confirm. */
+
+	float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y));  /* Albedo is looked up at the geometric mean of the two alpha components. */
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;  /* Blend of the two terms, weighted by albedo and (1 - albedo). */
 }
 
 ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
-	float3 wh;
-	float fresnel;
-	if(wi.z*wo.z > 0.0f) {
-		wh = normalize(wi + wo);
-		fresnel = fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
-	else {
-		wh = normalize(wi + wo*eta);
-		fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
+	bool reflective = (wi.z*wo.z > 0.0f);  /* True when wi.z and wo.z have the same sign. */
+
+	float wh_len;
+	float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len);  /* wo is scaled by eta only in the non-reflective case; wh_len presumably receives the pre-normalization length -- TODO confirm. */
 	if(wh.z < 0.0f)
 		wh = -wh;
 	float3 r_wi = (wi.z < 0.0f)? -wi: wi;
-	return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z);
+	float lambda = mf_lambda(r_wi, make_float2(alpha, alpha));
+	float D = D_ggx(wh, alpha);
+	float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta);  /* Evaluated once with r_wi; the non-reflective branch below uses (1 - fresnel). */
+
+	float multiscatter = fabsf(wo.z * M_1_PI_F);  /* Absolute value, since wo.z has the opposite sign to wi.z in the non-reflective case. */
+	if(reflective) {
+		float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f);  /* Same form as the single-scatter term in mf_ggx_pdf, using r_wi. */
+		float albedo = mf_ggx_albedo(alpha);
+		return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
+	else {
+		float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f));  /* Denominator is floored at 1e-7 to avoid dividing by zero. */
+		float albedo = mf_ggx_transmission_albedo(alpha, eta);
+		return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
 }
 
 /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
@@ -326,13 +360,17 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
-ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	if(is_zero(bsdf->T))
 		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
 
 	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);  /* Fresnel term evaluated with a cosine argument of 1.0. */
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));  /* sd is only read here, for sd->I. */
+	bsdf->sample_weight *= F;  /* Scale the sampling weight by the averaged Fresnel color. */
+
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -345,12 +383,16 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
-ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	bsdf->alpha_y = bsdf->alpha_x;
 
 	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);  /* Fresnel term evaluated with a cosine argument of 1.0. */
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));  /* sd is only read here, for sd->I. */
+	bsdf->sample_weight *= F;  /* Scale the sampling weight by the averaged Fresnel color. */
+
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -455,7 +497,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
-ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
 {
 	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
 	bsdf->alpha_y = bsdf->alpha_x;
@@ -469,6 +511,10 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsd
 
 	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
 
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 215c32e1ffb..f8ca64293b0 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -58,6 +58,14 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{  /* Returns true when both closures, viewed as PrincipledDiffuseBsdf, have equal N and equal roughness. */
+	const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a;  /* Unchecked cast: assumes the caller only passes closures with this layout -- TODO confirm. */
+	const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b;
+
+	return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);  /* roughness is compared with exact float equality. */
+}
+
 ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I,
 	const float3 omega_in, float *pdf)
 {
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 53d703de143..6226ed2c2ef 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -78,16 +78,10 @@ ccl_device_inline void filter_calculate_scale(float *scale)
 	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
 }
 
-ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer,
-                                                int pass_stride)
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+                                          int pass_stride)
 {
-	return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
-}
-
-ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer,
-                                                  int pass_stride)
-{
-	return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
+	return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));  /* Color now comes from features 8..10 of buffer (was 0..2); pass_stride is presumably consumed inside ccl_get_feature -- TODO confirm. */
 }
 
 ccl_device_inline void design_row_add(float *design_row,
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 5cb4038bc33..3e752bce68f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -101,7 +101,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 		for(int x = rect.x; x < rect.z; x++) {
 			const int low = max(rect.x, x-f);
 			const int high = min(rect.z, x+f+1);
-			out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+			out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
 		}
 	}
 }
@@ -133,8 +133,6 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            const float *ccl_restrict difference_image,
                                                            const float *ccl_restrict buffer,
-                                                           float *color_pass,
-                                                           float *variance_pass,
                                                            float *transform,
                                                            int *rank,
                                                            float *XtWX,
@@ -167,7 +165,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
 			                                dx, dy, w, h,
 			                                pass_stride,
 			                                buffer,
-			                                color_pass, variance_pass,
 			                                l_transform, l_rank,
 			                                weight, l_XtWX, l_XtWY, 0);
 		}
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 078c5f56763..2c5ac807051 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -66,7 +66,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
 		sum += difference_image[y*w+x1];
 	}
 	sum *= 1.0f/(high-low);
-	out_image[y*w+x] = expf(-max(sum, 0.0f));
+	out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
 }
 
 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
@@ -97,8 +97,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                            int dx, int dy,
                                                            const ccl_global float *ccl_restrict difference_image,
                                                            const ccl_global float *ccl_restrict buffer,
-                                                           ccl_global float *color_pass,
-                                                           ccl_global float *variance_pass,
                                                            const ccl_global float *ccl_restrict transform,
                                                            ccl_global int *rank,
                                                            ccl_global float *XtWX,
@@ -130,7 +128,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
 	                                dx, dy, w, h,
 	                                pass_stride,
 	                                buffer,
-	                                color_pass, variance_pass,
 	                                transform, rank,
 	                                weight, XtWX, XtWY,
 	                                localIdx);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 82cc36625ec..d5ae1b73927 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -142,13 +142,22 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
 	float ref = 2.0f*values[(int)(n*0.75f)];
 	float fac = 1.0f;
 	if(L > ref) {
-		/* If the pixel is an outlier, negate the depth value to mark it as one.
-		 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
-		depth[idx] = -depth[idx];
-		fac = ref/L;
-		variance[idx              ] *= fac*fac;
-		variance[idx + pass_stride] *= fac*fac;
-		variance[idx+2*pass_stride] *= fac*fac;
+		/* The pixel appears to be an outlier.
+		 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+		 * should actually be at the reference value:
+		 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+		 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+		 */
+		float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+		if(L - 3*stddev < ref) {
+			/* The pixel is an outlier, so negate the depth value to mark it as one.
+			 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+			depth[idx] = -depth[idx];
+			fac = ref/L;
+			variance[idx              ] *= fac*fac;
+			variance[idx + pass_stride] *= fac*fac;
+			variance[idx+2*pass_stride] *= fac*fac;
+		}
 	}
 	out[idx              ] = fac*image[idx];
 	out[idx + pass_stride] = fac*image[idx + pass_stride];
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 4a4c81b7ba3..25a3025056c 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -22,8 +22,6 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int w, int h,
                                                        int pass_stride,
                                                        const ccl_global float *ccl_restrict buffer,
-                                                       ccl_global float *color_pass,
-                                                       ccl_global float *variance_pass,
                                                        const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
                                                        float weight,
@@ -31,38 +29,31 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        ccl_global float3 *XtWY,
                                                        int localIdx)
 {
+	if(weight < 1e-3f) {
+		return;
+	}
+
 	int p_offset =  y    *w +  x;
 	int q_offset = (y+dy)*w + (x+dx);
 
-#ifdef __KERNEL_CPU__
-	const int stride = 1;
-	(void)storage_stride;
-	(void)localIdx;
-	float design_row[DENOISE_FEATURES+1];
-#elif defined(__KERNEL_CUDA__)
+#ifdef __KERNEL_GPU__
 	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
 	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
 	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
 #else
-	const int stride = storage_stride;
 	float design_row[DENOISE_FEATURES+1];
 #endif
 
-	float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
-	float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+	float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
 
-	float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
-	float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
-
-	/* If the pixel was flagged as an outlier during prefiltering, skip it.
-	 * Otherwise, perform the regular confidence interval test unless
-	 * the center pixel is an outlier (in that case, using the confidence
-	 * interval test could result in no pixels being used at all). */
-	bool p_outlier = (ccl_get_feature(buffer + p_offset, 0) < 0.0f);
-	bool q_outlier = (ccl_get_feature(buffer + q_offset, 0) < 0.0f);
-	bool outside_of_interval = (average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f));
-
-	if(q_outlier || (!p_outlier && outside_of_interval)) {
+	/* If the pixel was flagged as an outlier during prefiltering, skip it. */
+	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
 		return;
 	}
 
@@ -83,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                                               int4 buffer_params,
                                               int sample)
 {
-#ifdef __KERNEL_CPU__
-	const int stride = 1;
-	(void)storage_stride;
-#else
+#ifdef __KERNEL_GPU__
 	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
 #endif
 
+	if(XtWX[0] < 1e-3f) {
+		/* There is not enough information to determine a denoised result.
+		 * As a fallback, keep the original value of the pixel. */
+		 return;
+	}
+
 	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
 	 * In case the solution of the linear model fails due to numerical issues,
 	 * fall back to this value. */
@@ -102,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
 		final_color = mean_color;
 	}
 
+	/* Clamp pixel value to non-negative values. */
+	final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
 	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
 	final_color *= sample;
 	if(buffer_params.w) {
@@ -114,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
 	combined_buffer[2] = final_color.z;
 }
 
-#undef STORAGE_TYPE
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 06728415c15..175bd6b9737 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -621,25 +621,43 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 {
 	float fac = 1.0f/num_samples;
 
+#ifdef __SPLIT_KERNEL__
+#  define safe_float3_add(f, v) \
+	do { \
+		ccl_global float *p = (ccl_global float*)(&(f)); \
+		atomic_add_and_fetch_float(p+0, (v).x); \
+		atomic_add_and_fetch_float(p+1, (v).y); \
+		atomic_add_and_fetch_float(p+2, (v).z); \
+	} while(0)
+#else
+#  define safe_float3_add(f, v) (f) += (v)
+#endif  /* __SPLIT_KERNEL__ */
+
 #ifdef __PASSES__
-	L->direct_diffuse += L_sample->direct_diffuse*fac;
-	L->direct_glossy += L_sample->direct_glossy*fac;
-	L->direct_transmission += L_sample->direct_transmission*fac;
-	L->direct_subsurface += L_sample->direct_subsurface*fac;
-	L->direct_scatter += L_sample->direct_scatter*fac;
-
-	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
-	L->indirect_glossy += L_sample->indirect_glossy*fac;
-	L->indirect_transmission += L_sample->indirect_transmission*fac;
-	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
-	L->indirect_scatter += L_sample->indirect_scatter*fac;
-
-	L->background += L_sample->background*fac;
-	L->ao += L_sample->ao*fac;
-	L->shadow += L_sample->shadow*fac;
+	safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
+	safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
+	safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
+	safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
+	safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
+
+	safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
+	safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
+	safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
+	safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
+	safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
+
+	safe_float3_add(L->background, L_sample->background*fac);
+	safe_float3_add(L->ao, L_sample->ao*fac);
+	safe_float3_add(L->shadow, L_sample->shadow*fac);
+#  ifdef __SPLIT_KERNEL__
+	atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
+#  else
 	L->mist += L_sample->mist*fac;
-#endif
-	L->emission += L_sample->emission * fac;
+#  endif  /* __SPLIT_KERNEL__ */
+#endif  /* __PASSES__ */
+	safe_float3_add(L->emission, L_sample->emission*fac);
+
+#undef safe_float3_add
 }
 
 #ifdef __SHADOW_TRICKS__
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 9d52834ffcc..9cd7ffb181d 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -142,7 +142,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
 
 ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
                                                         ShaderData *sd,
-                                                        ccl_global PathState *state,
+                                                        ccl_addr_space PathState *state,
                                                         PathRadiance *L)
 {
 #ifdef __DENOISING_FEATURES__
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 0fa77d9e8bd..5d92fd12201 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -139,9 +139,11 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 	/* random number generator next bounce */
 	state->rng_offset += PRNG_BOUNCE_NUM;
 
+#ifdef __DENOISING_FEATURES__
 	if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
 		state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
 	}
+#endif
 }
 
 ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 96bc636d5ac..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index(
 	return my_gqidx;
 }
 
+ccl_device int dequeue_ray_index(
+        int queue_number,
+        ccl_global int *queues,
+        int queue_size,
+        ccl_global int *queue_index)
+{
+	int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1;
+	/* A negative index is reported as QUEUE_EMPTY_SLOT instead of reading a slot. */
+	if(index < 0) {
+		return QUEUE_EMPTY_SLOT;
+	}
+	/* Flat layout implied by the indexing: queue_number * queue_size + index. */
+	return queues[index + queue_number * queue_size];
+}
+
 CCL_NAMESPACE_END
 
 #endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 1026cde7b29..6475d4b66fd 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -418,7 +418,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 }
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index dbeaffdfb24..34affab1b9d 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -135,7 +135,7 @@ CCL_NAMESPACE_BEGIN
  * this is because megakernel in device_opencl does not support
  * custom cflags depending on the scene features.
  */
-#  endif  /* __KERNEL_OPENCL_NVIDIA__ */
+#  endif  /* __KERNEL_OPENCL_APPLE__ */
 
 #  ifdef __KERNEL_OPENCL_AMD__
 #    define __CL_USE_NATIVE__
@@ -236,6 +236,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_PRINCIPLED__
 #  undef __PRINCIPLED__
 #endif
+#ifdef __NO_DENOISING__
+#  undef __DENOISING_FEATURES__
+#endif
 
 /* Random Numbers */
 
@@ -1387,6 +1390,8 @@ enum QueueNumber {
 #ifdef __BRANCHED_PATH__
 	/* All rays moving to next iteration of the indirect loop for light */
 	QUEUE_LIGHT_INDIRECT_ITER,
+	/* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+	QUEUE_INACTIVE_RAYS,
 #  ifdef __VOLUME__
 	/* All rays moving to next iteration of the indirect loop for volumes */
 	QUEUE_VOLUME_INDIRECT_ITER,
@@ -1429,6 +1434,9 @@ enum RayState {
 	RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
 	RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
 	RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+	/* Ray is evaluating an iteration of an indirect loop for another thread */
+	RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 9c0878249d4..1e472aaf51a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -660,6 +660,7 @@ typedef struct VolumeSegment {
  * but the entire segment is needed to do always scattering, rather than probabilistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
  * volume we can't generate stratified distance samples up to that transmittance */
+#ifdef __VOLUME_DECOUPLED__
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
@@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
 #endif
 	}
 }
+#endif  /* __VOLUME_DECOUPLED__ */
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
  * marching.
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index ffd34c293fc..2ed713299fd 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -107,8 +107,6 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
                                                              float *difference_image,
                                                              float *buffer,
-                                                             float *color_pass,
-                                                             float *variance_pass,
                                                              float *transform,
                                                              int *rank,
                                                              float *XtWX,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 261176846b1..8dc1a8d583c 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -213,8 +213,6 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
                                                              float *difference_image,
                                                              float *buffer,
-                                                             float *color_pass,
-                                                             float *variance_pass,
                                                              float *transform,
                                                              int *rank,
                                                              float *XtWX,
@@ -229,7 +227,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-    kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+    kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 9895080d328..c8938534fe8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -85,6 +85,7 @@ DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
 DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
 DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9b85a864153..d4315ee5ec4 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -53,6 +53,7 @@
 #    include "kernel/split/kernel_direct_lighting.h"
 #    include "kernel/split/kernel_shadow_blocked_ao.h"
 #    include "kernel/split/kernel_shadow_blocked_dl.h"
+#    include "kernel/split/kernel_enqueue_inactive.h"
 #    include "kernel/split/kernel_next_iteration_setup.h"
 #    include "kernel/split/kernel_indirect_subsurface.h"
 #    include "kernel/split/kernel_buffer_update.h"
@@ -230,6 +231,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 2edbff08087..009c3fde9d5 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -207,8 +207,6 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
                                          const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
-                                         float *color_pass,
-                                         float *variance_pass,
                                          float const* __restrict__ transform,
                                          int *rank,
                                          float *XtWX,
@@ -225,7 +223,6 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
 		                                    dx, dy,
 		                                    difference_image,
 		                                    buffer,
-		                                    color_pass, variance_pass,
 		                                    transform, rank,
 		                                    XtWX, XtWY,
 		                                    rect, filter_rect,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 8b7f1a8d405..628891b1458 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -39,6 +39,7 @@
 #include "kernel/split/kernel_direct_lighting.h"
 #include "kernel/split/kernel_shadow_blocked_ao.h"
 #include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
 #include "kernel/split/kernel_next_iteration_setup.h"
 #include "kernel/split/kernel_indirect_subsurface.h"
 #include "kernel/split/kernel_buffer_update.h"
@@ -118,6 +119,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 0462ca6f9bc..ba53ba4b26f 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -207,8 +207,6 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
                                                       int dy,
                                                       const ccl_global float *ccl_restrict difference_image,
                                                       const ccl_global float *ccl_restrict buffer,
-                                                      ccl_global float *color_pass,
-                                                      ccl_global float *variance_pass,
                                                       const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
                                                       ccl_global float *XtWX,
@@ -227,7 +225,6 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
 		                                    dx, dy,
 		                                    difference_image,
 		                                    buffer,
-		                                    color_pass, variance_pass,
 		                                    transform, rank,
 		                                    XtWX, XtWY,
 		                                    rect, filter_rect,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
index db65c91baf7..dcea2630aef 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_buffer_update.h"
 
-__kernel void kernel_ocl_path_trace_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local unsigned int local_queue_atomics;
-	kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME buffer_update
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index eb34f750881..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_direct_lighting.h"
 
-__kernel void kernel_ocl_path_trace_direct_lighting(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local unsigned int local_queue_atomics;
-	kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME direct_lighting
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
index 83ef5f5f3f2..8afaa686e28 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_do_volume.h"
 
-__kernel void kernel_ocl_path_trace_do_volume(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_do_volume((KernelGlobals*)kg);
-}
+#define KERNEL_NAME do_volume
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index d071b39aa6f..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -18,12 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local BackgroundAOLocals locals;
-	kernel_holdout_emission_blurring_pathtermination_ao(
-	        (KernelGlobals*)kg,
-	        &locals);
-}
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
+#define LOCALS_TYPE BackgroundAOLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
index 8c213ff5cb2..192d01444ba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_indirect_background.h"
 
-__kernel void kernel_ocl_path_trace_indirect_background(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_indirect_background((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_background
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
index 998ebc4c0c3..84938b889e5 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_indirect_subsurface.h"
 
-__kernel void kernel_ocl_path_trace_indirect_subsurface(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_indirect_subsurface((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_subsurface
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 822d2287715..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_lamp_emission.h"
 
-__kernel void kernel_ocl_path_trace_lamp_emission(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_lamp_emission((KernelGlobals*)kg);
-}
+#define KERNEL_NAME lamp_emission
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d207253a40..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_next_iteration_setup.h"
 
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local unsigned int local_queue_atomics;
-	kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
index bd9aa9538c8..fa210e747c0 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_path_init.h"
 
-__kernel void kernel_ocl_path_trace_path_init(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_path_init((KernelGlobals*)kg);
-}
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 9be154e3d75..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_queue_enqueue.h"
 
-__kernel void kernel_ocl_path_trace_queue_enqueue(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local QueueEnqueueLocals locals;
-	kernel_queue_enqueue((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index eb4fb4d153a..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_scene_intersect.h"
 
-__kernel void kernel_ocl_path_trace_scene_intersect(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_scene_intersect((KernelGlobals*)kg);
-}
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 5bfb31b193a..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_shader_eval.h"
 
-__kernel void kernel_ocl_path_trace_shader_eval(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_shader_eval((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
index 38bfd04ad4c..8c36100f762 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_shader_setup.h"
 
-__kernel void kernel_ocl_path_trace_shader_setup(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local unsigned int local_queue_atomics;
-	kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
index 6f722915d45..bcacaa4a054 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -19,10 +19,9 @@
 #include "kernel/split/kernel_shader_sort.h"
 
 __attribute__((reqd_work_group_size(64, 1, 1)))
-__kernel void kernel_ocl_path_trace_shader_sort(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	ccl_local ShaderSortLocals locals;
-	kernel_shader_sort((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
index 6a8ef81b32a..8de250a375c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_shadow_blocked_ao.h"
 
-__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_shadow_blocked_ao((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
index b255cc5ef8b..29da77022ed 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_shadow_blocked_dl.h"
 
-__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_shadow_blocked_dl((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 8de82db7afe..651addb02f4 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -31,6 +31,7 @@
 #include "kernel/kernels/opencl/kernel_direct_lighting.cl"
 #include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
 #include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
 #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
 #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
 #include "kernel/kernels/opencl/kernel_buffer_update.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..f1e914a70d4
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
+
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+		ccl_global char *kg_global,
+		ccl_constant KernelData *data,
+
+		ccl_global void *split_data_buffer,
+		ccl_global char *ray_state,
+		ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+		ccl_global type *name,
+#include "kernel/kernel_textures.h"
+
+		ccl_global int *queue_index,
+		ccl_global char *use_queues_flag,
+		ccl_global unsigned int *work_pools,
+		ccl_global float *buffer
+	)
+{
+#ifdef LOCALS_TYPE
+	ccl_local LOCALS_TYPE locals;
+#endif
+
+	KernelGlobals *kg = (KernelGlobals*)kg_global;
+
+	if(ccl_local_id(0) + ccl_local_id(1) == 0) {
+		kg->data = data;
+
+		kernel_split_params.rng_state = rng_state;
+		kernel_split_params.queue_index = queue_index;
+		kernel_split_params.use_queues_flag = use_queues_flag;
+		kernel_split_params.work_pools = work_pools;
+		kernel_split_params.buffer = buffer;
+
+		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+		kg->name = name;
+#include "kernel/kernel_textures.h"
+	}
+
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
+			kg
+#ifdef LOCALS_TYPE
+			, &locals
+#endif
+		);
+}
+
+#undef KERNEL_NAME_JOIN
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
index 99b74a1802b..2b3be38df84 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_subsurface_scatter.h"
 
-__kernel void kernel_ocl_path_trace_subsurface_scatter(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_subsurface_scatter((KernelGlobals*)kg);
-}
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 188c3960a5f..27a96720c1e 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -191,7 +191,7 @@ class PrincipledBSSRDFClosure : public CBSSRDFClosure {
 public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
-		alloc(sd, path_flag, weight * albedo, CLOSURE_BSSRDF_PRINCIPLED_ID);
+		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
 	}
 };
 
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 5b66793a05d..14c5c1c3db5 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -156,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -164,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -191,7 +191,7 @@ BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
 class PrincipledClearcoatClosure : public CBSDFClosure {
 public:
 	MicrofacetBsdf params;
-	float clearcoat, clearcoat_gloss;
+	float clearcoat, clearcoat_roughness;
 
 	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
 	{
@@ -202,8 +202,8 @@ public:
 
 			bsdf->ior = 1.5f;
 
-			bsdf->alpha_x = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss;
-			bsdf->alpha_y = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss;
+			bsdf->alpha_x = clearcoat_roughness;
+			bsdf->alpha_y = clearcoat_roughness;
 
 			bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
 			bsdf->extra->clearcoat = clearcoat;
@@ -217,7 +217,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0;
 	}
 };
 
@@ -226,7 +226,7 @@ ClosureParam *closure_bsdf_principled_clearcoat_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),
 		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
-		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_gloss),
+		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
 		CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),
 		CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)
 	};
@@ -389,7 +389,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0;
 	}
 };
 
@@ -413,7 +413,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
 	}
 };
 
@@ -566,7 +566,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0;
 	}
 };
 
@@ -590,7 +590,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0;
 	}
 };
 
@@ -618,7 +618,7 @@ public:
 	void setup(ShaderData *sd, int path_flag, float3 weight)
 	{
 		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf) : 0;
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0;
 	}
 };
 
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 57f40789d49..2bb981c3918 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -32,7 +32,7 @@ shader node_principled_bsdf(
 	float Sheen = 0.0,
 	float SheenTint = 0.5,
 	float Clearcoat = 0.0,
-	float ClearcoatGloss = 1.0,
+	float ClearcoatRoughness = 0.03,
 	float IOR = 1.45,
 	float Transmission = 0.0,
 	float TransmissionRoughness = 0.0,
@@ -57,8 +57,8 @@ shader node_principled_bsdf(
 
 	if (diffuse_weight > 1e-5) {
 		if (Subsurface > 1e-5) {
-			color Albedo = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
-			BSDF = bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, Albedo, Roughness);
+			color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
+			BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness);
 		} else {
 			BSDF = BaseColor * principled_diffuse(Normal, Roughness);
 		}
@@ -114,7 +114,7 @@ shader node_principled_bsdf(
 	}
 
 	if (Clearcoat > 1e-5) {
-		BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatGloss);
+		BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness);
 	}
 }
 
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 289d1091b0a..c91d2918687 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -546,7 +546,7 @@ closure color holdout() BUILTIN;
 closure color ambient_occlusion() BUILTIN;
 closure color principled_diffuse(normal N, float roughness) BUILTIN;
 closure color principled_sheen(normal N) BUILTIN;
-closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_gloss) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
 
 // BSSRDF
 closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index dc74a2ada53..e2762a85fc8 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -63,12 +63,49 @@ ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobal
 	REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
 }
 
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+		kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+	if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+		return false;
+	}
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+		kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+	SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+	kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+	kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+	kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+	path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+	inactive_L->direct_throughput = L->direct_throughput;
+	path_radiance_copy_indirect(inactive_L, L);
+
+	ray_state[inactive_ray] = RAY_REGENERATED;
+	ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+	ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+
+	atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+	return true;
+}
+
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
                                                                                 int ray_index,
                                                                                 float num_samples_adjust,
                                                                                 ShaderData *saved_sd,
-                                                                                bool reset_path_state)
+                                                                                bool reset_path_state,
+                                                                                bool wait_for_shared)
 {
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
@@ -155,12 +192,25 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 			/* start the indirect path */
 			*tp *= num_samples_inv;
 
+			if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+				continue;
+			}
+
 			return true;
 		}
 
 		branched_state->next_sample = 0;
 	}
 
+	branched_state->next_closure = sd->num_closure;
+
+	if(wait_for_shared) {
+		branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+		if(branched_state->waiting_on_shared_samples) {
+			return true;
+		}
+	}
+
 	return false;
 }
 
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 694b777f429..9f8dd2392d9 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -75,11 +75,30 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 			branched_state->next_sample = j+1;
 			branched_state->num_samples = num_samples;
 
+			/* Attempting to share too many samples is slow for volumes as it causes us to
+			 * loop here more and have many calls to kernel_volume_integrate which evaluates
+			 * shaders. The many expensive shader evaluations cause the workload to become
+			 * unbalanced and many threads to become idle in this kernel. Limiting the
+			 * number of shared samples here helps quite a lot.
+			 */
+			if(branched_state->shared_sample_count < 2) {
+				if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+					continue;
+				}
+			}
+
 			return true;
 		}
 #  endif
 	}
 
+	branched_state->next_sample = num_samples;
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
 	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
 
 	/* todo: avoid this calculation using decoupled ray marching */
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+                                        ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	char enqueue_flag = 0;
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+		enqueue_flag = 1;
+	}
+
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_INACTIVE_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif  /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 71017fed19e..7758e35fd32 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -147,6 +147,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 			                                                          ray_index,
 			                                                          1.0f,
 			                                                          &kernel_split_state.branched_state[ray_index].sd,
+			                                                          true,
 			                                                          true))
 			{
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
@@ -193,6 +194,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 		                                                          ray_index,
 		                                                          1.0f,
 		                                                          &kernel_split_state.branched_state[ray_index].sd,
+		                                                          true,
 		                                                          true))
 		{
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
index e2e841f36d3..66ce2dfb6f1 100644
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
 	int queue_number = -1;
 
 	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
-	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
 		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
 	}
 	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 5dc94caec85..45984ca509b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 	}
 
 	/* All regenerated rays become active here */
-	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+		if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+			kernel_split_path_end(kg, ray_index);
+		}
+		else
+#endif  /* __BRANCHED_PATH__ */
+		{
+			ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+		}
+	}
 
-	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
+	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
 		return;
+	}
 
 #ifdef __KERNEL_DEBUG__
 	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 386fbbc4d09..78e61709b01 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -29,6 +29,14 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
 	}
 
+#ifdef __BRANCHED_PATH__
+	/* TODO(mai): move this somewhere else? */
+	if(thread_index == 0) {
+		/* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+		kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+	}
+#endif  /* __BRANCHED_PATH__ */
+
 	if(ray_index == QUEUE_EMPTY_SLOT)
 		return;
 
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 57f070d51e0..08f0124b529 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -56,7 +56,20 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 
 #ifdef __BRANCHED_PATH__
-	if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+	if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+		int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+		path_radiance_sum_indirect(L);
+		path_radiance_accum_sample(orig_ray_L, L, 1);
+
+		atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
 	}
 	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index bb1aca2acbf..4bb2f0d3d80 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -95,6 +95,10 @@ typedef ccl_global struct SplitBranchedState {
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #  endif  /* __VOLUME__ */
 #endif  /*__SUBSURFACE__ */
+
+	int shared_sample_count; /* number of branched samples shared with other threads */
+	int original_ray; /* index of original ray when sharing branched samples */
+	bool waiting_on_shared_samples;
 } SplitBranchedState;
 
 #define SPLIT_DATA_BRANCHED_ENTRIES \
@@ -137,6 +141,25 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_BRANCHED_ENTRIES \
 	SPLIT_DATA_DEBUG_ENTRIES \
 
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+	SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_BRANCHED_ENTRIES \
+	SPLIT_DATA_DEBUG_ENTRIES \
+
 /* struct that holds pointers to data in the shared state buffer */
 typedef struct SplitData {
 #define SPLIT_DATA_ENTRY(type, name, num) type *name;
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 1dffe1b179e..d5083b23f80 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -169,6 +169,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 				                                                          ray_index,
 				                                                          num_samples_inv,
 				                                                          bssrdf_sd,
+				                                                          false,
 				                                                          false))
 				{
 					branched_state->ss_next_closure = i;
@@ -187,6 +188,13 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 		branched_state->ss_next_sample = 0;
 	}
 
+	branched_state->ss_next_closure = sd->num_closure;
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
 	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
 
 	return false;
@@ -257,21 +265,20 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 				/* do bssrdf scatter step if we picked a bssrdf closure */
 				if(sc) {
 					uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
-
 					float bssrdf_u, bssrdf_v;
 					path_state_rng_2D(kg,
-						              &rng,
-						              state,
-						              PRNG_BSDF_U,
-						              &bssrdf_u, &bssrdf_v);
+					                  &rng,
+					                  state,
+					                  PRNG_BSDF_U,
+					                  &bssrdf_u, &bssrdf_v);
 					subsurface_scatter_step(kg,
-						                    sd,
-						                    state,
-						                    state->flag,
-						                    sc,
-						                    &lcg_state,
-						                    bssrdf_u, bssrdf_v,
-						                    false);
+					                        sd,
+					                        state,
+					                        state->flag,
+					                        sc,
+					                        &lcg_state,
+					                        bssrdf_u, bssrdf_v,
+					                        false);
 				}
 			}
 			else {
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index f04f765686e..9578fcf2687 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -79,13 +79,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __PRINCIPLED__
 		case CLOSURE_BSDF_PRINCIPLED_ID: {
 			uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
-				sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset, eta_offset, transmission_offset,
+				sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset,
 				anisotropic_rotation_offset, transmission_roughness_offset;
 			uint4 data_node2 = read_node(kg, offset);
 
 			float3 T = stack_load_float3(stack, data_node.y);
 			decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
-			decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_gloss_offset);
+			decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset);
 			decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
 
 			// get Disney principled parameters
@@ -98,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float sheen = stack_load_float(stack, sheen_offset);
 			float sheen_tint = stack_load_float(stack, sheen_tint_offset);
 			float clearcoat = stack_load_float(stack, clearcoat_offset);
-			float clearcoat_gloss = stack_load_float(stack, clearcoat_gloss_offset);
+			float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset);
 			float transmission = stack_load_float(stack, transmission_offset);
 			float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
 			float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
@@ -141,8 +141,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float3 weight = sd->svm_closure_weight * mix_weight;
 
 #ifdef __SUBSURFACE__
-			float3 albedo = subsurface_color * subsurface + base_color * (1.0f - subsurface);
-			float3 subsurf_weight = weight * albedo * diffuse_weight;
+			float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface);
+			float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
 			float subsurf_sample_weight = fabsf(average(subsurf_weight));
 
 			/* disable in case of diffuse ancestor, can't see it well then and
@@ -154,11 +154,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				/* need to set the base color in this case such that the
 				 * rays get the correctly mixed color after transmitting
 				 * the object */
-				base_color = albedo;
+				base_color = mixed_ss_base_color;
 			}
 
 			/* diffuse */
-			if(fabsf(average(base_color)) > CLOSURE_WEIGHT_CUTOFF) {
+			if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
 				if(subsurface < CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
 					float3 diff_weight = weight * base_color * diffuse_weight;
 
@@ -186,7 +186,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->sample_weight = subsurf_sample_weight;
 						bssrdf->radius = radius.x;
 						bssrdf->texture_blur = texture_blur;
-						bssrdf->albedo = albedo.x;
+						bssrdf->albedo = subsurface_color.x;
 						bssrdf->sharpness = sharpness;
 						bssrdf->N = N;
 						bssrdf->roughness = roughness;
@@ -200,7 +200,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->sample_weight = subsurf_sample_weight;
 						bssrdf->radius = radius.y;
 						bssrdf->texture_blur = texture_blur;
-						bssrdf->albedo = albedo.y;
+						bssrdf->albedo = subsurface_color.y;
 						bssrdf->sharpness = sharpness;
 						bssrdf->N = N;
 						bssrdf->roughness = roughness;
@@ -214,7 +214,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->sample_weight = subsurf_sample_weight;
 						bssrdf->radius = radius.z;
 						bssrdf->texture_blur = texture_blur;
-						bssrdf->albedo = albedo.z;
+						bssrdf->albedo = subsurface_color.z;
 						bssrdf->sharpness = sharpness;
 						bssrdf->N = N;
 						bssrdf->roughness = roughness;
@@ -292,9 +292,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 						/* setup bsdf */
 						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
-							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
 						else /* use multi-scatter GGX */
-							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -332,7 +332,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->extra->cspec0 = cspec0;
 
 								/* setup bsdf */
-								sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf);
+								sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
 							}
 						}
 
@@ -377,7 +377,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							bsdf->extra->cspec0 = cspec0;
 
 							/* setup bsdf */
-							sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
 						}
 					}
 				}
@@ -398,14 +398,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bsdf->ior = 1.5f;
 						bsdf->extra = extra;
 
-						bsdf->alpha_x = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss;
-						bsdf->alpha_y = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss;
+						bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+						bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
 
 						bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
 						bsdf->extra->clearcoat = clearcoat;
 
 						/* setup bsdf */
-						sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -725,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
 
 				if(bsdf) {
+					bsdf->N = N;
 					bsdf->roughness1 = param1;
 					bsdf->roughness2 = param2;
 					bsdf->offset = -stack_load_float(stack, data_node.z);
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index 2569d9eec27..943b218f0e4 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp)
 		bypass(input->link);
 		return true;
 	}
+	else {
+		/* disconnect other inputs if we can't fully bypass due to clamp */
+		foreach(ShaderInput *other, node->inputs) {
+			if(other != input && other->link) {
+				graph->disconnect(other);
+			}
+		}
+	}
 
 	return false;
 }
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 625dd3ded39..93d88c5642c 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -224,6 +224,10 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 
 bool LightManager::object_usable_as_light(Object *object) {
 	Mesh *mesh = object->mesh;
+	/* Skip objects with NaNs */
+	if (!object->bounds.valid()) {
+		return false;
+	}
 	/* Skip if we are not visible for BSDFs. */
 	if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
 		return false;
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 57b475e5cd0..166156f7ac3 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2308,13 +2308,13 @@ NODE_DEFINE(PrincipledBsdfNode)
 	SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
 	SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
 	SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
-	SOCKET_IN_FLOAT(roughness, "Roughness", 0.0f);
+	SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
 	SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
 	SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f);
 	SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f);
 	SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f);
 	SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f);
-	SOCKET_IN_FLOAT(clearcoat_gloss, "Clearcoat Gloss", 0.0f);
+	SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f);
 	SOCKET_IN_FLOAT(ior, "IOR", 0.0f);
 	SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f);
 	SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f);
@@ -2351,7 +2351,7 @@ void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attribu
 
 void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius,
 	ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic,
-	ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_gloss,
+	ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness,
 	ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness)
 {
 	ShaderInput *base_color_in = input("Base Color");
@@ -2374,7 +2374,7 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic,
 	int sheen_offset = compiler.stack_assign(p_sheen);
 	int sheen_tint_offset = compiler.stack_assign(p_sheen_tint);
 	int clearcoat_offset = compiler.stack_assign(p_clearcoat);
-	int clearcoat_gloss_offset = compiler.stack_assign(p_clearcoat_gloss);
+	int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness);
 	int ior_offset = compiler.stack_assign(p_ior);
 	int transmission_offset = compiler.stack_assign(p_transmission);
 	int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
@@ -2391,7 +2391,7 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic,
 
 	compiler.add_node(normal_offset, tangent_offset,
 		compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset),
-		compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset));
+		compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset));
 
 	compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset),
 		distribution, SVM_STACK_INVALID, SVM_STACK_INVALID);
@@ -2419,7 +2419,7 @@ void PrincipledBsdfNode::compile(SVMCompiler& compiler)
 {
 	compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"),
 		input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"),
-		input("Clearcoat"), input("Clearcoat Gloss"), input("IOR"), input("Transmission"),
+		input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"),
 		input("Anisotropic Rotation"), input("Transmission Roughness"));
 }
 
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index aac6ce2f375..c6ab47fcc84 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -252,6 +252,7 @@ public:
 class PointDensityTextureNode : public ShaderNode {
 public:
 	SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
+	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
 	~PointDensityTextureNode();
 	ShaderNode *clone() const;
@@ -377,13 +378,13 @@ public:
 	bool has_bssrdf_bump();
 	void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius,
 		ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic,
-		ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_gloss,
+		ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness,
 		ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness);
 
 	float3 base_color;
 	float3 subsurface_color, subsurface_radius;
 	float metallic, subsurface, specular, roughness, specular_tint, anisotropic,
-		sheen, sheen_tint, clearcoat, clearcoat_gloss, ior, transmission,
+		sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission,
 		anisotropic_rotation, transmission_roughness;
 	float3 normal, clearcoat_normal, tangent;
 	float surface_mix_weight;
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 6bff29d1c76..a794f233718 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -156,6 +156,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 	og->surface_state.clear();
 	og->volume_state.clear();
 	og->displacement_state.clear();
+	og->bump_state.clear();
 	og->background_state.reset();
 }
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 08909943c49..ae462a1084a 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -722,6 +722,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	requested_features.use_baking = bake_manager->get_baking();
 	requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
 	requested_features.use_transparent &= scene->integrator->transparent_shadows;
+	requested_features.use_denoising = params.use_denoising;
 
 	return requested_features;
 }
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 22ec8e0ee8e..6c059ba5d12 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both)
 	EXPECT_EQ(str, "foo bar zzz");
 }
 
+TEST(util_string_remove_trademark, both_space)
+{
+	string str = string_remove_trademark("foo bar(TM) (R) zzz");
+	EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, both_space_around)
+{
+	string str = string_remove_trademark("foo bar (TM) (R) zzz");
+	EXPECT_EQ(str, "foo bar zzz");
+}
+
+TEST(util_string_remove_trademark, trademark_space_suffix)
+{
+	string str = string_remove_trademark("foo bar (TM)");
+	EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, trademark_space_middle)
+{
+	string str = string_remove_trademark("foo bar (TM) baz");
+	EXPECT_EQ(str, "foo bar baz");
+}
+
+
+TEST(util_string_remove_trademark, r_space_suffix)
+{
+	string str = string_remove_trademark("foo bar (R)");
+	EXPECT_EQ(str, "foo bar");
+}
+
+TEST(util_string_remove_trademark, r_space_middle)
+{
+	string str = string_remove_trademark("foo bar (R) baz");
+	EXPECT_EQ(str, "foo bar baz");
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 6c52117ef9a..643af87a65f 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -35,6 +35,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
 
 #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)
 
 #define CCL_LOCAL_MEM_FENCE 0
 #define ccl_barrier(flags) (void)0
@@ -68,6 +69,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
 
 #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
 #define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+#define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
 
 #define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
 #define ccl_barrier(flags) barrier(flags)
@@ -79,7 +81,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
 #define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
 
 #define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x))
 #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
 
 #define CCL_LOCAL_MEM_FENCE
 #define ccl_barrier(flags) __syncthreads()
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index ab038d2b9fb..10895f2e918 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -184,8 +184,8 @@ std::ostream& operator <<(std::ostream &os,
 	   << "  Device type    : " << opencl_device_type << "\n"
 	   << "  Kernel type    : " << opencl_kernel_type << "\n"
 	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
-	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program)
-	   << "\n";
+	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
+	   << "  Memory limit   : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
 	return os;
 }
 
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 4505d584490..450cd900a9f 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -115,6 +115,10 @@ public:
 
 		/* Use single program */
 		bool single_program;
+
+		/* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
+		/* Artificial memory limit in bytes (0 if disabled). */
+		size_t mem_limit;
 	};
 
 	/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index ecf9c9cfee0..492f830e67c 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -19,28 +19,30 @@
 
 #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__)
 #  include <glog/logging.h>
-#else
-#  include <iostream>
 #endif
 
+#include <iostream>
+
 CCL_NAMESPACE_BEGIN
 
 #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__)
-class StubStream : public std::ostream {
- public:
-	StubStream() : std::ostream(NULL) { }
+class StubStream {
+public:
+	template<class T>
+	StubStream& operator<<(const T&) {
+		return *this;
+	}
 };
 
 class LogMessageVoidify {
 public:
 	LogMessageVoidify() { }
-	void operator&(::std::ostream&) { }
+	void operator&(StubStream&) { }
 };
 
 #  define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
 #  define LOG(severity) LOG_SUPPRESS()
 #  define VLOG(severity) LOG_SUPPRESS()
-
 #endif
 
 #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 5327d9f7cc6..bb04c4aa2d9 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -374,9 +374,9 @@ ccl_device_inline bool isfinite3_safe(float3 v)
 
 ccl_device_inline float3 ensure_finite3(float3 v)
 {
-	if(!isfinite_safe(v.x)) v.x = 0.0;
-	if(!isfinite_safe(v.y)) v.y = 0.0;
-	if(!isfinite_safe(v.z)) v.z = 0.0;
+	if(!isfinite_safe(v.x)) v.x = 0.0f;
+	if(!isfinite_safe(v.y)) v.y = 0.0f;
+	if(!isfinite_safe(v.z)) v.z = 0.0f;
 	return v;
 }
 
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index bc672669e1f..cd4fe52fdc9 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -226,6 +226,7 @@ public:
 
 	int get_current_sample()
 	{
+		thread_scoped_lock lock(progress_mutex);
 		/* Note that the value here always belongs to the last tile that updated,
 		 * so it's only useful if there is only one active tile. */
 		return current_tile_sample;
@@ -233,11 +234,13 @@ public:
 
 	int get_rendered_tiles()
 	{
+		thread_scoped_lock lock(progress_mutex);
 		return rendered_tiles;
 	}
 
 	int get_denoised_tiles()
 	{
+		thread_scoped_lock lock(progress_mutex);
 		return denoised_tiles;
 	}
 
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index a1008d510d1..94ad512982c 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other)
 string string_remove_trademark(const string &s)
 {
 	string result = s;
+
+	/* Special case, so we don't leave sequential spaces behind. */
+	/* TODO(sergey): Consider using regex perhaps? */
+	string_replace(result, " (TM)", "");
+	string_replace(result, " (R)", "");
+
 	string_replace(result, "(TM)", "");
 	string_replace(result, "(R)", "");
 
diff --git a/intern/elbeem/intern/isosurface.cpp b/intern/elbeem/intern/isosurface.cpp
index fb61fb416b4..de7bfe8e687 100644
--- a/intern/elbeem/intern/isosurface.cpp
+++ b/intern/elbeem/intern/isosurface.cpp
@@ -15,6 +15,7 @@
 #include "particletracer.h"
 #include <algorithm>
 #include <stdio.h>
+#include <cmath>
 
 #ifdef sun
 #include "ieeefp.h"
@@ -25,6 +26,8 @@
 #define round(x) (x)
 #endif
 
+using std::isfinite;
+
 /******************************************************************************
  * Constructor
  *****************************************************************************/
@@ -937,17 +940,10 @@ void IsoSurface::smoothSurface(float sigma, bool normSmooth)
 											 ew[(j+2)%3]);
 			}
 
-			// NT important, check this...
-#ifndef WIN32
-			if(! finite(cornerareas[i][0]) ) cornerareas[i][0]=1e-6;
-			if(! finite(cornerareas[i][1]) ) cornerareas[i][1]=1e-6;
-			if(! finite(cornerareas[i][2]) ) cornerareas[i][2]=1e-6;
-#else // WIN32
-			// FIXME check as well...
-			if(! (cornerareas[i][0]>=0.0) ) cornerareas[i][0]=1e-6;
-			if(! (cornerareas[i][1]>=0.0) ) cornerareas[i][1]=1e-6;
-			if(! (cornerareas[i][2]>=0.0) ) cornerareas[i][2]=1e-6;
-#endif // WIN32
+			// FIX T50887: ensure pointareas are finite
+			if (!isfinite(cornerareas[i][0])) cornerareas[i][0] = 1e-6;
+			if (!isfinite(cornerareas[i][1])) cornerareas[i][1] = 1e-6;
+			if (!isfinite(cornerareas[i][2])) cornerareas[i][2] = 1e-6;
 
 			pointareas[mIndices[i*3+0]] += cornerareas[i][0];
 			pointareas[mIndices[i*3+1]] += cornerareas[i][1];
@@ -1096,17 +1092,10 @@ void IsoSurface::smoothNormals(float sigma) {
 											 ew[(j+2)%3]);
 			}
 
-			// NT important, check this...
-#ifndef WIN32
-			if(! finite(cornerareas[i][0]) ) cornerareas[i][0]=1e-6;
-			if(! finite(cornerareas[i][1]) ) cornerareas[i][1]=1e-6;
-			if(! finite(cornerareas[i][2]) ) cornerareas[i][2]=1e-6;
-#else // WIN32
-			// FIXME check as well...
-			if(! (cornerareas[i][0]>=0.0) ) cornerareas[i][0]=1e-6;
-			if(! (cornerareas[i][1]>=0.0) ) cornerareas[i][1]=1e-6;
-			if(! (cornerareas[i][2]>=0.0) ) cornerareas[i][2]=1e-6;
-#endif // WIN32
+			// FIX T50887: ensure pointareas are finite
+			if (!isfinite(cornerareas[i][0])) cornerareas[i][0] = 1e-6;
+			if (!isfinite(cornerareas[i][1])) cornerareas[i][1] = 1e-6;
+			if (!isfinite(cornerareas[i][2])) cornerareas[i][2] = 1e-6;
 
 			pointareas[mIndices[i*3+0]] += cornerareas[i][0];
 			pointareas[mIndices[i*3+1]] += cornerareas[i][1];
diff --git a/intern/elbeem/intern/solver_util.cpp b/intern/elbeem/intern/solver_util.cpp
index 6eca427c787..f0c7bce2b4e 100644
--- a/intern/elbeem/intern/solver_util.cpp
+++ b/intern/elbeem/intern/solver_util.cpp
@@ -855,6 +855,10 @@ void LbmFsgrSolver::advanceParticles() {
 				if(k<=mSizez-1-cutval){ 
 					CellFlagType pflag = RFLAG(level, i,j,k, workSet);
 					//errMsg("PIT move"," at "<<PRINT_IJK<<" flag"<<convertCellFlagType2String(pflag) );
+					if (pflag & CFMbndOutflow) {
+						DEL_PART;
+						continue;
+					}
 					if(pflag & (CFBnd)) {
 						handleObstacleParticle(p);
 						continue;
diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h
index 6887063eae9..967d3f58143 100644
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@@ -43,7 +43,7 @@ extern "C" {
  * Creates a "handle" for a C++ GHOST object.
  * A handle is just an opaque pointer to an empty struct.
  * In the API the pointer is casted to the actual C++ class.
- * \param name Name of the handle to create.
+ * The 'name' argument to the macro is the name of the handle to create.
  */
 
 GHOST_DECLARE_HANDLE(GHOST_SystemHandle);
diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp
index 9f03b5e9537..b0dae432643 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -941,6 +941,8 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 	GHOST_ASSERT(system, "GHOST_SystemWin32::s_wndProc(): system not initialized");
 
 	if (hwnd) {
+#if 0
+		// Disabled due to bug in Intel drivers, see T51959
 		if(msg == WM_NCCREATE) {
 			// Tell Windows to automatically handle scaling of non-client areas
 			// such as the caption bar. EnableNonClientDpiScaling was introduced in Windows 10
@@ -954,6 +956,7 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 				}
 			}
 		}
+#endif
 
 		GHOST_WindowWin32 *window = (GHOST_WindowWin32 *)::GetWindowLongPtr(hwnd, GWLP_USERDATA);
 		if (window) {
diff --git a/intern/ghost/intern/GHOST_WindowCocoa.h b/intern/ghost/intern/GHOST_WindowCocoa.h
index b234291396b..5168c48ca2f 100644
--- a/intern/ghost/intern/GHOST_WindowCocoa.h
+++ b/intern/ghost/intern/GHOST_WindowCocoa.h
@@ -56,7 +56,7 @@ public:
 	 * \param systemCocoa		The associated system class to forward events to
 	 * \param title				The text shown in the title bar of the window.
 	 * \param left				The coordinate of the left edge of the window.
-	 * \param top				The coordinate of the top edge of the window.
+	 * \param bottom			The coordinate of the bottom edge of the window.
 	 * \param width				The width the window.
 	 * \param height			The height the window.
 	 * \param state				The state the window is initially opened with.
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index ce8a5b29ece..b4838cdca18 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -64,9 +64,9 @@ enum {
 	MEMHEAD_ALIGN_FLAG = 2,
 };
 
-#define MEMHEAD_FROM_PTR(ptr) (((MemHead*) vmemh) - 1)
+#define MEMHEAD_FROM_PTR(ptr) (((MemHead*) ptr) - 1)
 #define PTR_FROM_MEMHEAD(memhead) (memhead + 1)
-#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) vmemh) - 1)
+#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) ptr) - 1)
 #define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) MEMHEAD_MMAP_FLAG)
 #define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t) MEMHEAD_ALIGN_FLAG)
 
diff --git a/intern/libmv/ChangeLog b/intern/libmv/ChangeLog
index 45be9c25afa..81096dd90c9 100644
--- a/intern/libmv/ChangeLog
+++ b/intern/libmv/ChangeLog
@@ -1,3 +1,156 @@
+commit efd7a93317e0278b99e66785f667823e451daef1
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Tue May 9 10:16:42 2017 +0200
+
+    Fix strict compiler warnings, unused variables
+
+commit 8efd47e13dfdd3f7209bc96f26d0b13127dd6376
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed Dec 14 10:44:57 2016 +0100
+
+    Fix T50243: libmv_panography_test is broken
+    
+    There was fully wrong logic in comparison: was actually accessing memory
+    past the array boundary. Run test manually and the figure seems correct
+    to me now.
+    
+    Spotted by @LazyDodo, thanks!
+
+commit 6dfb9cd1bd14669d84be789000ce234747fb00ff
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Thu Jul 14 11:49:38 2016 +0200
+
+    Fix some strict compiler warnings
+    
+    One of them was a real bug!
+
+commit f61adaecf7b29ebe6677be0e1c825f0a8d475e4b
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed May 31 11:22:34 2017 +0200
+
+    Enable explicit schur complement for BA step
+    
+    This is something we do in Blender and only reason it was not
+    enabled for standalone Libmv is because we did not have fresh
+    enough version of Ceres bundled.
+
+commit fc5d3a1d4880c6658aff693c1c1e8c10c96ce1a7
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed Nov 2 15:32:11 2016 +0100
+
+    Update tests to make tests pass after recent Ceres update
+    
+    Just a precision issue, difference is around 1e-7. Should be fine to
+    simply update expected value.
+
+commit e1ac9f6124110c1a90d8e417bea47acfcbdcca42
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed May 31 10:54:48 2017 +0200
+
+    Update Ceres to latest release 1.12.0
+
+commit ac1571352b4962f110929b963f8616d7310ceea5
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Fri Apr 7 17:10:44 2017 +0200
+
+    Fix crash of keyframe selection on 32bit linux
+
+commit 5f8df3da965686df39a6ae5c9f17482075017bf4
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Tue Jan 19 14:00:53 2016 +0500
+
+    Solve some strict warnings in tests
+
+commit 8ea3a5d752a9ce3337ab7643897472a4d33747f1
+Author: Brecht Van Lommel <brechtvanlommel@gmail.com>
+Date:   Sat Feb 18 23:52:31 2017 +0100
+
+    Fix a few compiler warnings with macOS / clang.
+
+commit ffbe81461770e70736e80b8cab8e6eb1f8b27160
+Author: Mike Erwin <significant.bit@gmail.com>
+Date:   Wed May 31 10:43:08 2017 +0200
+
+    Fix comparison of identicals
+    
+    Some of these check that dimensions match before running code that
+    assumes they do match.
+    
+    Found with PVS-Studio T48917.
+
+commit 206c01999cde16c1c6c43a8e13ffa86020821d98
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed May 31 10:39:16 2017 +0200
+
+    Add basic track masking API in place
+    
+    This brings back ability to mask non-interesting parts of
+    specific track (the feature got lost with new auto-track API).
+    
+    Added it back by extending frame accessor class. This isn't really
+    a frame thing, but we don't have other type of accessor here.
+    
+    Surely, we can use old-style API here and pass mask via region
+    tracker options for this particular case, but then it becomes much
+    less obvious how real auto-tracker will access this mask with old
+    style API.
+    
+    So seems we do need an accessor for such data, just matter of
+    finding better place than frame accessor.
+
+commit faa069cb826892780356477cc10602390fecf06b
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed May 31 10:36:26 2017 +0200
+
+    Tests: Tweak epsilon to avoid what looks a false-positive failure
+
+commit 7c84e45c1d330871477ba3516f57178e5b9d101f
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed May 31 10:15:43 2017 +0200
+
+    CMake: Fix mistake in closing branch
+
+commit cb769a0d319a8c95948153d78a4c3378a0142ece
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Thu Jul 21 12:52:33 2016 +0200
+
+    Set of fixes for MSVC215
+    
+    - Move GLOG/GFLAGS defines to a more global scope,
+      this way ANY of our own libraries will use proper
+      declspec.
+    
+    - Compile png/zlib/openexif on Windows as well since
+      those are required for a correct linking.
+
+commit bb95c8654fd2cea72d66ed04cd825cc3712ea804
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed Jul 20 18:14:46 2016 +0200
+
+    Disable unexisting Ceres option
+    
+    Explicit Schur complement requires having
+    newer Ceres than we currently have bundled.
+
+commit a2e12c959ef32cc9382244d1581992c2f7aa9c09
+Author: Sergey Sharybin <sergey.vfx@gmail.com>
+Date:   Wed Jul 20 18:04:57 2016 +0200
+
+    Various fixes for MSVC
+    
+    - Update Eigen to 3.2.7 since this brings crucial
+      fixes for MSVC 2015.
+    
+    - Switch to STATIC build by default.
+    
+      There are issues building current sources as dynamic
+      libraries with MSVC2015 and additionally building
+      dynamic Ceres is not recommended anyway, so let's
+      not do this for the time being.
+    
+    If anyone finds a way to make this all working --
+    it'll surely be a welcome addition.
+
 commit 7a676106720fb126a27ff010abdd8bb65d7e0d9a
 Author: Sergey Sharybin <sergey.vfx@gmail.com>
 Date:   Mon Jan 4 18:30:12 2016 +0500
@@ -365,239 +518,3 @@ Date:   Thu May 8 15:50:26 2014 +0200
     Reviewed By: sergey
     
     Differential Revision: https://developer.blender.org/D516
-
-commit 4405dff60ea08d454b64da1a7c0595d9328cf8a3
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Thu May 8 15:38:14 2014 +0200
-
-    Add public SetMarkers to AutoTrack
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D515
-
-commit c90837f6db276a3b1f610eaad509155f6a43b24f
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Thu May 8 15:17:48 2014 +0200
-
-    Make autotrack skeleton compile
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D514
-
-commit be01baa2e82e36f63e548f073157e68d2ff870c0
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Wed May 7 18:48:55 2014 +0200
-
-    Add preliminary TrackMarkerToFrame in autotrack
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D509
-
-commit 0cab028d591b3d08672ca86eb6c6e4ac1aacf1d0
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Wed May 7 17:59:11 2014 +0200
-
-    Remove assert from ArrayND Resize
-    
-    That assert broke initialization of arrays which doesn't
-    own the data since constructor uses Resize to set shape
-    and strides.
-    
-    Strides are still to be fixed, but that's for later.
-
-commit 64f9c118029a9351e9023e96527c120e1d724d5b
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Wed May 7 17:42:21 2014 +0200
-
-    Fix ArrayND freeing the data it doesn't own
-    
-    Can't really guarantee it works fully correct now,
-    but at least this check is needed anyway and compilation
-    works just fine.
-    
-    Reviewers: keir
-    
-    Reviewed By: keir
-    
-    Differential Revision: https://developer.blender.org/D508
-
-commit 0618f1c8e88dfc738cdde55784da80b889905e7c
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Wed May 7 12:03:32 2014 +0200
-
-    Minor changes
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D505
-
-commit 5c34335e1bb90c4ed701ee830c718ed4e20dbffa
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Wed May 7 11:12:23 2014 +0200
-
-    Fix compilation error in frame accessor
-    
-    - int64 is not a standard type, we've got int64_t defined in
-      std int. We also have an msvc port of this header, so should
-      not be an issue.
-    
-    - Fixed inconsistency in usage of CacheKey and Key, used Key.
-    
-    - Some functions weren't marked as virtual.
-    
-    Additional change: added self to authors.
-    
-    Reviewers: keir
-    
-    Reviewed By: keir
-    
-    Differential Revision: https://developer.blender.org/D504
-
-commit 06bc207614e262cd688e2c3ed820ade7c77bdb66
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Tue May 6 22:30:59 2014 +0200
-
-    Start new Tracks implementation
-    
-    This adds the new Tracks implementation, as well as a
-    trivial test to show it compiles.
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D502
-
-commit 25ce061e6da69881460ba7718bb0d660a2380a02
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Tue May 6 19:10:51 2014 +0200
-
-    Add Reconstruction class for new API
-    
-    This starts the new Reconstruction class (with support for e.g. planes). This
-    also starts the new namespace "mv" which will eventually have all the symbols
-    we wish to export.
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D501
-
-commit 0a6af3e29016048978aea607673340500e050339
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Tue May 6 17:52:53 2014 +0200
-
-    Add a new Tracks implementation
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D500
-
-commit 887b68d29c2b198f4939f9ab5153881aa2c1806e
-Author: Keir Mierle <mierle@gmail.com>
-Date:   Tue May 6 17:01:39 2014 +0200
-
-    Initial commit of unfinished AutoTrack API
-    
-    This starts the creating the new AutoTrack API. The new API will
-    make it possible for libmv to do full autotracking, including
-    predictive tracking and also support multiple motion models (3D
-    planes etc).
-    
-    The first goal (not in this patch) is to convert Blender to use
-    the new API without adding any new functionality.
-    
-    Note: This does not add any of the API to the build system!
-    It likely does not compile.
-    
-    Reviewers: sergey
-    
-    Reviewed By: sergey
-    
-    Differential Revision: https://developer.blender.org/D499
-
-commit 08cc227d431d257d27f300fbb8e6991e663302da
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Tue May 6 13:09:22 2014 +0200
-
-    Fix homography test failure
-    
-    It was caused by assuming that reconstructed homography matrix
-    should look exactly the same as the matrix used to generate a
-    test case.
-    
-    It's not actually valid assumption because different-looking
-    matrices could correspond to the same exact transform.
-    
-    In this change we make it so actual "re-projected" vectors
-    are being checked, not the values in matrix. This makes it
-    more predictable verification.
-    
-    Reviewers: keir
-    
-    Reviewed By: keir
-    
-    Differential Revision: https://developer.blender.org/D488
-
-commit 0b7d83dc9627447dc7df64d7e3a468aefe9ddc13
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Wed Apr 23 19:14:55 2014 +0600
-
-    Fix compilation on OSX after previous commit
-    
-    EXPECT_EQ wasn't defined in the scope.
-
-commit d14049e00dabf8fdf49056779f0a3718fbb39e8f
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Wed Apr 23 15:08:16 2014 +0600
-
-    Move aligned malloc implementation into own file
-    
-    It was rather stupid having it in brute region tracker,
-    now it is in own file in base library (which was also
-    added in this commit, before this it consist of header
-    files only).
-    
-    Reviewers: keir
-    
-    Reviewed By: keir
-    
-    Differential Revision: https://developer.blender.org/D479
-
-commit 0ddf3851bfcb8de43660b119a25a77a25674200d
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Mon Apr 21 14:14:03 2014 +0600
-
-    Optimization of PearsonProductMomentCorrelation
-    
-    Pass the arrays by reference rather than by value,
-    should give some percent of speedup.
-    
-    Also don't pass the dimensions to the function but
-    get them from the images themselves.
-    
-    Hopefully this will give some %% of tracker speedup.
-
-commit f68fdbe5896a6c5bd8b500caeec61b876c5e44c6
-Author: Sergey Sharybin <sergey.vfx@gmail.com>
-Date:   Mon Apr 21 14:10:43 2014 +0600
-
-    Fix wrong assert in ResizeImage()
-    
-    The assert didn't make any sense because ComputeBoundingBox()
-    is intended to return bounding box in the following way:
-    (xmin, xmax, ymin, ymax).
diff --git a/intern/libmv/bundle.sh b/intern/libmv/bundle.sh
index 27e012f665f..d155d050782 100755
--- a/intern/libmv/bundle.sh
+++ b/intern/libmv/bundle.sh
@@ -120,6 +120,7 @@ if(WITH_LIBMV)
 	add_definitions(\${GFLAGS_DEFINES})
 	add_definitions(\${GLOG_DEFINES})
 	add_definitions(\${CERES_DEFINES})
+	add_definitions(-DLIBMV_GFLAGS_NAMESPACE=\${GFLAGS_NAMESPACE})
 
 	list(APPEND INC
 		\${GFLAGS_INCLUDE_DIRS}
diff --git a/intern/libmv/intern/track_region.cc b/intern/libmv/intern/track_region.cc
index d395b6457d7..8989897e09f 100644
--- a/intern/libmv/intern/track_region.cc
+++ b/intern/libmv/intern/track_region.cc
@@ -36,7 +36,7 @@
 
 /* define this to generate PNG images with content of search areas
    on every itteration of tracking */
-#define DUMP_ALWAYS
+#undef DUMP_ALWAYS
 
 using libmv::FloatImage;
 using libmv::TrackRegionOptions;
diff --git a/intern/memutil/MEM_CacheLimiterC-Api.h b/intern/memutil/MEM_CacheLimiterC-Api.h
index 0fe5469a4d4..b5680890eb8 100644
--- a/intern/memutil/MEM_CacheLimiterC-Api.h
+++ b/intern/memutil/MEM_CacheLimiterC-Api.h
@@ -61,8 +61,8 @@ bool MEM_CacheLimiter_is_disabled(void);
  * Create new MEM_CacheLimiter object
  * managed objects are destructed with the data_destructor
  *
- * @param data_destructor
- * @return A new MEM_CacheLimter object
+ * \param data_destructor
+ * \return A new MEM_CacheLimiter object
  */
 
 MEM_CacheLimiterC *new_MEM_CacheLimiter(MEM_CacheLimiter_Destruct_Func data_destructor,
@@ -73,7 +73,7 @@ MEM_CacheLimiterC *new_MEM_CacheLimiter(MEM_CacheLimiter_Destruct_Func data_dest
  *
  * Frees the memory of the CacheLimiter but does not touch managed objects!
  *
- * @param This "This" pointer
+ * \param This "This" pointer
  */
 
 void delete_MEM_CacheLimiter(MEM_CacheLimiterC *This);
@@ -81,8 +81,8 @@ void delete_MEM_CacheLimiter(MEM_CacheLimiterC *This);
 /**
  * Manage object
  *
- * @param This "This" pointer, data data object to manage
- * @return CacheLimiterHandle to ref, unref, touch the managed object
+ * \param This "This" pointer, data data object to manage
+ * \return CacheLimiterHandle to ref, unref, touch the managed object
  */
 
 MEM_CacheLimiterHandleC *MEM_CacheLimiter_insert(MEM_CacheLimiterC *This, void *data);
@@ -90,7 +90,7 @@ MEM_CacheLimiterHandleC *MEM_CacheLimiter_insert(MEM_CacheLimiterC *This, void *
 /**
  * Free objects until memory constraints are satisfied
  *
- * @param This "This" pointer
+ * \param This "This" pointer
  */
 
 void MEM_CacheLimiter_enforce_limits(MEM_CacheLimiterC *This);
@@ -99,7 +99,7 @@ void MEM_CacheLimiter_enforce_limits(MEM_CacheLimiterC *This);
  * Unmanage object previously inserted object.
  * Does _not_ delete managed object!
  *
- * @param This "This" pointer, handle of object
+ * \param handle of object
  */
 
 void MEM_CacheLimiter_unmanage(MEM_CacheLimiterHandleC *handle);
@@ -108,7 +108,7 @@ void MEM_CacheLimiter_unmanage(MEM_CacheLimiterHandleC *handle);
 /**
  * Raise priority of object (put it at the tail of the deletion chain)
  *
- * @param handle of object
+ * \param handle of object
  */
 
 void MEM_CacheLimiter_touch(MEM_CacheLimiterHandleC *handle);
@@ -117,7 +117,7 @@ void MEM_CacheLimiter_touch(MEM_CacheLimiterHandleC *handle);
  * Increment reference counter. Objects with reference counter != 0 are _not_
  * deleted.
  *
- * @param handle of object
+ * \param handle of object
  */
 
 void MEM_CacheLimiter_ref(MEM_CacheLimiterHandleC *handle);
@@ -126,7 +126,7 @@ void MEM_CacheLimiter_ref(MEM_CacheLimiterHandleC *handle);
  * Decrement reference counter. Objects with reference counter != 0 are _not_
  * deleted.
  *
- * @param handle of object
+ * \param handle of object
  */
 
 void MEM_CacheLimiter_unref(MEM_CacheLimiterHandleC *handle);
@@ -134,7 +134,7 @@ void MEM_CacheLimiter_unref(MEM_CacheLimiterHandleC *handle);
 /**
  * Get reference counter.
  *
- * @param handle of object
+ * \param handle of object
  */
 
 int MEM_CacheLimiter_get_refcount(MEM_CacheLimiterHandleC *handle);
@@ -142,7 +142,7 @@ int MEM_CacheLimiter_get_refcount(MEM_CacheLimiterHandleC *handle);
 /**
  * Get pointer to managed object
  *
- * @param handle of object
+ * \param handle of object
  */
 
 void *MEM_CacheLimiter_get(MEM_CacheLimiterHandleC *handle);
diff --git a/intern/opensubdiv/opensubdiv_capi.cc b/intern/opensubdiv/opensubdiv_capi.cc
index 52ce98fe74b..91803551f12 100644
--- a/intern/opensubdiv/opensubdiv_capi.cc
+++ b/intern/opensubdiv/opensubdiv_capi.cc
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <GL/glew.h>
 
+#include <opensubdiv/version.h>
 #include <opensubdiv/osd/glMesh.h>
 
 /* CPU Backend */
@@ -381,3 +382,8 @@ int openSubdiv_supportGPUDisplay(void)
 	        (GLEW_ARB_texture_buffer_object || GLEW_EXT_texture_buffer_object)));
 	/* also ARB_explicit_attrib_location? */
 }
+
+int openSubdiv_getVersionHex(void)
+{
+	return OPENSUBDIV_VERSION_NUMBER;
+}
diff --git a/intern/opensubdiv/opensubdiv_capi.h b/intern/opensubdiv/opensubdiv_capi.h
index c3a194813e6..281bd3f010d 100644
--- a/intern/opensubdiv/opensubdiv_capi.h
+++ b/intern/opensubdiv/opensubdiv_capi.h
@@ -152,6 +152,8 @@ void openSubdiv_init(bool gpu_legacy_support);
 void openSubdiv_cleanup(void);
 bool openSubdiv_gpu_legacy_support(void);
 
+int openSubdiv_getVersionHex(void);
+
 #ifdef __cplusplus
 }
 #endif