286 files changed, 3906 insertions, 1418 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 0147a4306f4..873bbfa36fa 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -3,7 +3,14 @@ if(NOT WITH_BLENDER AND WITH_CYCLES_STANDALONE)
 	set(CYCLES_INSTALL_PATH "")
 else()
 	set(WITH_CYCLES_BLENDER ON)
-	set(CYCLES_INSTALL_PATH "scripts/addons/cycles")
+	# WINDOWS_PYTHON_DEBUG needs to write into the user addons folder since it will
+	# be started with --env-system-scripts pointing to the release folder, which will
+	# lack the cycles addon, and we don't want to write into it.
+	if(NOT WINDOWS_PYTHON_DEBUG)
+		set(CYCLES_INSTALL_PATH "scripts/addons/cycles")
+	else()
+		set(CYCLES_INSTALL_PATH "$ENV{appdata}/blender foundation/blender/${BLENDER_VERSION}/scripts/addons/cycles")
+	endif()
 endif()
 
 # External Libraries
@@ -210,6 +217,15 @@ if(WITH_CYCLES_OSL)
 	)
 endif()
 
+if(WITH_CYCLES_EMBREE)
+	add_definitions(-DWITH_EMBREE)
+	add_definitions(-DEMBREE_STATIC_LIB)
+	include_directories(
+		SYSTEM
+		${EMBREE_INCLUDE_DIRS}
+	)
+endif()
+
 if(WITH_CYCLES_OPENSUBDIV)
 	add_definitions(-DWITH_OPENSUBDIV)
 	include_directories(
@@ -283,12 +299,19 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
 			set(MAX_MSVC 1910)
 		elseif(${CUDA_VERSION} EQUAL "9.1")
 			set(MAX_MSVC 1911)
+		elseif(${CUDA_VERSION} EQUAL "10.0")
+			set(MAX_MSVC 1999)
 		endif()
 		if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang")
 			message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
 			set(WITH_CYCLES_CUBIN_COMPILER ON)
 		endif()
 		unset(MAX_MSVC)
+	elseif(APPLE)
+		if("${XCODE_VERSION}" VERSION_GREATER_EQUAL 10.0)  # quoted: an unset XCODE_VERSION must not break if()
+			message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
+			set(WITH_CYCLES_CUBIN_COMPILER ON)
+		endif()
 	endif()
 endif()
 
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 4fd551b33c2..2c1367a86dc 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -77,6 +77,9 @@ macro(cycles_target_link_libraries target)
 	if(WITH_CYCLES_OSL)
 		target_link_libraries(${target} ${OSL_LIBRARIES} ${LLVM_LIBRARIES})
 	endif()
+	if(WITH_CYCLES_EMBREE)
+		target_link_libraries(${target} ${EMBREE_LIBRARIES})
+	endif()
 	if(WITH_CYCLES_OPENSUBDIV)
 		target_link_libraries(${target} ${OPENSUBDIV_LIBRARIES})
 	endif()
@@ -144,6 +147,9 @@ if(WITH_CYCLES_CUBIN_COMPILER)
 		target_link_libraries(cycles_cubin_cc
 			extern_cuew
 			${OPENIMAGEIO_LIBRARIES}
+			${OPENEXR_LIBRARIES}
+			${PUGIXML_LIBRARIES}
+			${BOOST_LIBRARIES}
 			${PLATFORM_LINKLIBS}
 		)
 		if(NOT CYCLES_STANDALONE_REPOSITORY)
diff --git a/intern/cycles/app/cycles_xml.h b/intern/cycles/app/cycles_xml.h
index 6a48980d8ea..a7bc1895d4e 100644
--- a/intern/cycles/app/cycles_xml.h
+++ b/intern/cycles/app/cycles_xml.h
@@ -29,4 +29,4 @@ void xml_read_file(Scene *scene, const char *filepath);
 
 CCL_NAMESPACE_END
 
-#endif /* __CYCLES_XML_H__ */
+#endif  /* __CYCLES_XML_H__ */
diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h
index 233ffc8802c..b9750ad0c53 100644
--- a/intern/cycles/blender/CCL_api.h
+++ b/intern/cycles/blender/CCL_api.h
@@ -33,4 +33,4 @@ void CCL_logging_verbosity_set(int verbosity);
 }
 #endif
 
-#endif /* __CCL_API_H__ */
+#endif  /* __CCL_API_H__ */
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 87dcbe486c7..23239ee4352 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -254,21 +254,32 @@ def register_passes(engine, scene, srl):
     if crl.use_pass_volume_indirect:           engine.register_pass(scene, srl, "VolumeInd",                     3, "RGB", 'COLOR')
 
     cscene = scene.cycles
-    if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine:
-        engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR')
-        engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
-        engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR')
-        engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
-        engine.register_pass(scene, srl, "Denoising Depth",           1, "Z",   'VALUE')
-        engine.register_pass(scene, srl, "Denoising Depth Variance",  1, "Z",   'VALUE')
-        engine.register_pass(scene, srl, "Denoising Shadow A",        3, "XYV", 'VECTOR')
-        engine.register_pass(scene, srl, "Denoising Shadow B",        3, "XYV", 'VECTOR')
-        engine.register_pass(scene, srl, "Denoising Image",           3, "RGB", 'COLOR')
-        engine.register_pass(scene, srl, "Denoising Image Variance",  3, "RGB", 'COLOR')
-
-        clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
-                         "denoising_glossy_direct", "denoising_glossy_indirect",
-                         "denoising_transmission_direct", "denoising_transmission_indirect",
-                         "denoising_subsurface_direct", "denoising_subsurface_indirect")
-        if any(getattr(crl, option) for option in clean_options):
-            engine.register_pass(scene, srl, "Denoising Clean", 3, "RGB", 'COLOR')
+
+    if crl.use_pass_crypto_object:
+        for i in range(crl.pass_crypto_depth // 2):  # two ID/weight pairs per RGBA pass; names must match sync_render_passes
+            engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+    if crl.use_pass_crypto_material:
+        for i in range(crl.pass_crypto_depth // 2):
+            engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+    if srl.cycles.use_pass_crypto_asset:
+        for i in range(srl.cycles.pass_crypto_depth // 2):
+            engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+
+    if crl.use_denoising or crl.denoising_store_passes:
+        engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR')
+        if crl.denoising_store_passes:
+            engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR')
+            engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
+            engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR')
+            engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
+            engine.register_pass(scene, srl, "Denoising Depth",           1, "Z",   'VALUE')
+            engine.register_pass(scene, srl, "Denoising Depth Variance",  1, "Z",   'VALUE')
+            engine.register_pass(scene, srl, "Denoising Shadow A",        3, "XYV", 'VECTOR')
+            engine.register_pass(scene, srl, "Denoising Shadow B",        3, "XYV", 'VECTOR')
+            engine.register_pass(scene, srl, "Denoising Image Variance",  3, "RGB", 'COLOR')
+            clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
+                             "denoising_glossy_direct", "denoising_glossy_indirect",
+                             "denoising_transmission_direct", "denoising_transmission_indirect",
+                             "denoising_subsurface_direct", "denoising_subsurface_indirect")
+            if any(getattr(crl, option) for option in clean_options):
+                engine.register_pass(scene, srl, "Denoising Clean", 3, "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 80b83c94012..d986ba8c7a8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -547,6 +547,11 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             description="Use special type BVH optimized for hair (uses more ram but renders faster)",
             default=True,
         )
+        cls.use_bvh_embree = BoolProperty(
+            name="Use Embree",
+            description="Use Embree as ray accelerator",
+            default=False,
+        )
         cls.debug_bvh_time_steps = IntProperty(
             name="BVH Time Steps",
             description="Split BVH primitives by this number of time steps to speed up render time in cost of memory",
@@ -1339,7 +1344,36 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
             default=False,
             update=update_render_passes,
         )
-
+        cls.use_pass_crypto_object = BoolProperty(
+                name="Cryptomatte Object",
+                description="Render cryptomatte object pass, for isolating objects in compositing",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.use_pass_crypto_material = BoolProperty(
+                name="Cryptomatte Material",
+                description="Render cryptomatte material pass, for isolating materials in compositing",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.use_pass_crypto_asset = BoolProperty(
+                name="Cryptomatte Asset",
+                description="Render cryptomatte asset pass, for isolating groups of objects with the same parent",
+                default=False,
+                update=update_render_passes,
+                )
+        cls.pass_crypto_depth = IntProperty(
+                name="Cryptomatte Levels",
+                description="Sets how many unique objects can be distinguished per pixel",
+                default=6, min=2, max=16, step=2,
+                update=update_render_passes,
+                )
+        cls.pass_crypto_accurate = BoolProperty(
+                name="Cryptomatte Accurate",
+                description="Generate a more accurate Cryptomatte pass. CPU only, may render slower and use more memory",
+                default=True,
+                update=update_render_passes,
+                )
     @classmethod
     def unregister(cls):
         del bpy.types.SceneRenderLayer.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 5edbcb19672..2f1adfe4178 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -17,6 +17,7 @@
 # <pep8 compliant>
 
 import bpy
+import _cycles
 
 from bpy.types import (
     Panel,
@@ -430,11 +431,18 @@ class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel):
         col.separator()
 
         col.label(text="Acceleration structure:")
+        if _cycles.with_embree:
+            row = col.row()
+            row.active = use_cpu(context)
+            row.prop(cscene, "use_bvh_embree")
+        row = col.row()
         col.prop(cscene, "debug_use_spatial_splits")
-        col.prop(cscene, "debug_use_hair_bvh")
+        row = col.row()
+        row.active = not cscene.use_bvh_embree or not _cycles.with_embree
+        row.prop(cscene, "debug_use_hair_bvh")
 
         row = col.row()
-        row.active = not cscene.debug_use_spatial_splits
+        row.active = not cscene.debug_use_spatial_splits and not cscene.use_bvh_embree
         row.prop(cscene, "debug_bvh_time_steps")
 
         col = layout.column()
@@ -491,8 +499,6 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
     bl_options = {'DEFAULT_CLOSED'}
 
     def draw(self, context):
-        import _cycles
-
         layout = self.layout
 
         scene = context.scene
@@ -517,6 +523,8 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_shadow")
         col.prop(rl, "use_pass_ambient_occlusion")
         col.separator()
+        col.prop(crl, "denoising_store_passes", text="Denoising Data")
+        col.separator()
         col.prop(rl, "pass_alpha_threshold")
 
         col = split.column()
@@ -549,12 +557,6 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_pass_emit", text="Emission")
         col.prop(rl, "use_pass_environment")
 
-        if context.scene.cycles.feature_set == 'EXPERIMENTAL':
-            col.separator()
-            sub = col.column()
-            sub.active = crl.use_denoising
-            sub.prop(crl, "denoising_store_passes", text="Denoising")
-
         col = layout.column()
         col.prop(crl, "pass_debug_render_time")
         if _cycles.with_cycles_debug:
@@ -563,6 +565,17 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
             col.prop(crl, "pass_debug_bvh_intersections")
             col.prop(crl, "pass_debug_ray_bounces")
 
+        crl = rl.cycles
+        layout.label(text="Cryptomatte:")
+        row = layout.row(align=True)
+        row.prop(crl, "use_pass_crypto_object", text="Object", toggle=True)
+        row.prop(crl, "use_pass_crypto_material", text="Material", toggle=True)
+        row.prop(crl, "use_pass_crypto_asset", text="Asset", toggle=True)
+        row = layout.row(align=True)
+        row.prop(crl, "pass_crypto_depth")
+        row = layout.row(align=True)
+        row.active = use_cpu(context)  # accurate mode is described as CPU only, so grey it out otherwise
+        row.prop(crl, "pass_crypto_accurate", text="Accurate Mode")
 
 class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel):
     bl_label = "Views"
@@ -630,9 +643,8 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
         rl = rd.layers.active
         crl = rl.cycles
 
-        layout.active = crl.use_denoising
-
         split = layout.split()
+        split.active = crl.use_denoising
 
         col = split.column()
         sub = col.column(align=True)
@@ -647,24 +659,28 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
         layout.separator()
 
         row = layout.row()
+        row.active = crl.use_denoising or crl.denoising_store_passes
         row.label(text="Diffuse:")
         sub = row.row(align=True)
         sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
         sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
 
         row = layout.row()
+        row.active = crl.use_denoising or crl.denoising_store_passes
         row.label(text="Glossy:")
         sub = row.row(align=True)
         sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
         sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
 
         row = layout.row()
+        row.active = crl.use_denoising or crl.denoising_store_passes
         row.label(text="Transmission:")
         sub = row.row(align=True)
         sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
         sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
 
         row = layout.row()
+        row.active = crl.use_denoising or crl.denoising_store_passes
         row.label(text="Subsurface:")
         sub = row.row(align=True)
         sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 99313866e9e..94d5dc5ea3d 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -707,7 +707,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 			if(diff == 0) {
 				for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve]; curvekey++) {
 					if(i < mesh->curve_keys.size()) {
-						mP[i] =CurveSegmentMotionCV(CData, sys, curve, curvekey);
+						mP[i] = CurveSegmentMotionCV(CData, sys, curve, curvekey);
 						if(!have_motion) {
 							/* unlike mesh coordinates, these tend to be slightly different
 							 * between frames due to particle transforms into/out of object
@@ -718,7 +718,6 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 								have_motion = true;
 						}
 					}
-
 					i++;
 				}
 			}
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
index d0f82e37662..3fca4efd097 100644
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -22,7 +22,7 @@ void CCL_init_logging(const char *argv0)
 	ccl::util_logging_init(argv0);
 }
 
-void CCL_start_debug_logging(void)
+void CCL_start_debug_logging()
 {
 	ccl::util_logging_start();
 }
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 35bf7beda41..a05c982b367 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -384,6 +384,23 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}
 
+	/* sync the asset name for Cryptomatte */
+	BL::Object parent = b_ob.parent();
+	ustring parent_name;
+	if(parent) {
+		while(parent.parent()) {
+			parent = parent.parent();
+		}
+		parent_name = parent.name();
+	}
+	else {
+		parent_name = b_ob.name();
+	}
+	if(object->asset_name != parent_name) {
+		object->asset_name = parent_name;
+		object_updated = true;
+	}
+
 	/* object sync
 	 * transform comparison should not be needed, but duplis don't work perfect
 	 * in the depsgraph and may not signal changes, so this is a workaround */
@@ -404,8 +421,8 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 
 			if(scene->need_motion() == Scene::MOTION_BLUR) {
 				motion_steps = object_motion_steps(b_parent, b_ob);
+				mesh->motion_steps = motion_steps;
 				if(motion_steps && object_use_deform_motion(b_parent, b_ob)) {
-					mesh->motion_steps = motion_steps;
 					mesh->use_motion_blur = true;
 				}
 			}
diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h
index 2147877a860..6e2a22438ec 100644
--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -46,4 +46,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __BLENDER_OBJECT_CULL_H__ */
+#endif  /* __BLENDER_OBJECT_CULL_H__ */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 4b01eb5f2d4..8b3bec56d1f 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -839,10 +839,18 @@ void *CCL_python_module_init()
 #ifdef WITH_NETWORK
 	PyModule_AddObject(mod, "with_network", Py_True);
 	Py_INCREF(Py_True);
-#else /* WITH_NETWORK */
+#else  /* WITH_NETWORK */
 	PyModule_AddObject(mod, "with_network", Py_False);
 	Py_INCREF(Py_False);
-#endif /* WITH_NETWORK */
+#endif  /* WITH_NETWORK */
+
+#ifdef WITH_EMBREE
+	PyModule_AddObject(mod, "with_embree", Py_True);
+	Py_INCREF(Py_True);
+#else  /* WITH_EMBREE */
+	PyModule_AddObject(mod, "with_embree", Py_False);
+	Py_INCREF(Py_False);
+#endif  /* WITH_EMBREE */
 
 	return (void*)mod;
 }
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index a07131d04ae..75c7dcee05e 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -35,6 +35,7 @@
 #include "util/util_function.h"
 #include "util/util_hash.h"
 #include "util/util_logging.h"
+#include "util/util_murmurhash.h"
 #include "util/util_progress.h"
 #include "util/util_time.h"
 
@@ -370,6 +371,17 @@ void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
 		do_write_update_render_tile(rtile, false, false);
 }
 
+static void add_cryptomatte_layer(BL::RenderResult& b_rr, string name, string manifest)
+{
+	string identifier = string_printf("%08x", util_murmur_hash3(name.c_str(), name.length(), 0));
+	string prefix = "cryptomatte/" + identifier.substr(0, 7) + "/";
+
+	render_add_metadata(b_rr, prefix+"name", name);
+	render_add_metadata(b_rr, prefix+"hash", "MurmurHash3_32");
+	render_add_metadata(b_rr, prefix+"conversion", "uint32_to_float32");
+	render_add_metadata(b_rr, prefix+"manifest", manifest);
+}
+
 void BlenderSession::render()
 {
 	/* set callback to write out render results */
@@ -405,17 +417,19 @@ void BlenderSession::render()
 		BL::RenderLayer b_rlay = *b_single_rlay;
 
 		/* add passes */
-		array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
+		vector<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
 		buffer_params.passes = passes;
 
 		PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
 		bool use_denoising = get_boolean(crl, "use_denoising");
+		bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes");
 
 		session->tile_manager.schedule_denoising = use_denoising;
-		buffer_params.denoising_data_pass = use_denoising;
+		buffer_params.denoising_data_pass = denoising_passes;
 		buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
 
 		session->params.use_denoising = use_denoising;
+		session->params.denoising_passes = denoising_passes;
 		session->params.denoising_radius = get_int(crl, "denoising_radius");
 		session->params.denoising_strength = get_float(crl, "denoising_strength");
 		session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
@@ -475,15 +489,28 @@ void BlenderSession::render()
 				break;
 		}
 
+		BL::RenderResult b_full_rr = b_engine.get_result();
 		if(is_single_layer) {
-			BL::RenderResult b_rr = b_engine.get_result();
 			string num_aa_samples = string_printf("%d", session->params.samples);
-			b_rr.stamp_data_add_field("Cycles Samples", num_aa_samples.c_str());
+			render_add_metadata(b_full_rr, "Cycles Samples", num_aa_samples);
 			/* TODO(sergey): Report whether we're doing resumable render
 			 * and also start/end sample if so.
 			 */
 		}
 
+		if(scene->film->cryptomatte_passes & CRYPT_OBJECT) {
+			add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoObject",
+			                      scene->object_manager->get_cryptomatte_objects(scene));
+		}
+		if(scene->film->cryptomatte_passes & CRYPT_MATERIAL) {
+			add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoMaterial",
+			                      scene->shader_manager->get_cryptomatte_materials(scene));
+		}
+		if(scene->film->cryptomatte_passes & CRYPT_ASSET) {
+			add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoAsset",
+			                      scene->object_manager->get_cryptomatte_assets(scene));
+		}
+
 		/* free result without merging */
 		end_render_result(b_engine, b_rr, true, true, false);
 
@@ -700,7 +727,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
 			bool read = false;
 			if(pass_type != PASS_NONE) {
 				/* copy pixels */
-				read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
+				read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0], b_pass.name());
 			}
 			else {
 				int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
@@ -719,7 +746,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
 	else {
 		/* copy combined pass */
 		BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
-		if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0]))
+		if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0], "Combined"))
 			b_combined_pass.rect(&pixels[0]);
 	}
 
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 08f5c873bef..b8a9096b354 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -173,4 +173,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BLENDER_SESSION_H__ */
+#endif  /* __BLENDER_SESSION_H__ */
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 3eefb92f6af..e33a6c20a52 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -635,8 +635,8 @@ static ShaderNode *add_node(Scene *scene,
 			}
 		}
 #else
-		(void)b_data;
-		(void)b_ntree;
+		(void) b_data;
+		(void) b_ntree;
 #endif
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeTexImage)) {
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 5e47252e336..832847c179f 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -40,6 +40,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+static const char *cryptomatte_prefix = "Crypto";
+
 /* Constructor */
 
 BlenderSync::BlenderSync(BL::RenderEngine& b_engine,
@@ -517,6 +519,9 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
 	MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
 #endif
 	MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+	if(string_startswith(name, cryptomatte_prefix)) {
+		return PASS_CRYPTOMATTE;
+	}
 #undef MAP_PASS
 
 	return PASS_NONE;
@@ -525,6 +530,9 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
 int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 {
 	string name = b_pass.name();
+
+	if(name == "Noisy Image") return DENOISING_PASS_COLOR;
+
 	if(name.substr(0, 10) != "Denoising ") {
 		return -1;
 	}
@@ -539,7 +547,6 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 	MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
 	MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
 	MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
-	MAP_PASS("Image", DENOISING_PASS_COLOR);
 	MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
 	MAP_PASS("Clean", DENOISING_PASS_CLEAN);
 #undef MAP_PASS
@@ -547,11 +554,11 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 	return -1;
 }
 
-array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
-                                            BL::SceneRenderLayer& b_srlay,
-                                            const SessionParams &session_params)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
+                                             BL::SceneRenderLayer& b_srlay,
+                                             const SessionParams &session_params)
 {
-	array<Pass> passes;
+	vector<Pass> passes;
 	Pass::add(PASS_COMBINED, passes);
 
 	if(!session_params.device.advanced_shading) {
@@ -571,22 +578,11 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 			Pass::add(pass_type, passes);
 	}
 
-	scene->film->denoising_flags = 0;
 	PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
-	if(get_boolean(crp, "denoising_store_passes") &&
-	   get_boolean(crp, "use_denoising"))
-	{
-		b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Depth",           1, "Z",   b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Depth Variance",  1, "Z",   b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Shadow A",        3, "XYV", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Shadow B",        3, "XYV", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Image",           3, "RGB", b_srlay.name().c_str());
-		b_engine.add_pass("Denoising Image Variance",  3, "RGB", b_srlay.name().c_str());
-
+	bool use_denoising = get_boolean(crp, "use_denoising");
+	bool store_denoising_passes = get_boolean(crp, "denoising_store_passes");
+	scene->film->denoising_flags = 0;
+	if(use_denoising || store_denoising_passes) {
 #define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag;
 		MAP_OPTION("denoising_diffuse_direct",        DENOISING_CLEAN_DIFFUSE_DIR);
 		MAP_OPTION("denoising_diffuse_indirect",      DENOISING_CLEAN_DIFFUSE_IND);
@@ -597,9 +593,22 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 		MAP_OPTION("denoising_subsurface_direct",     DENOISING_CLEAN_SUBSURFACE_DIR);
 		MAP_OPTION("denoising_subsurface_indirect",   DENOISING_CLEAN_SUBSURFACE_IND);
 #undef MAP_OPTION
+		b_engine.add_pass("Noisy Image", 4, "RGBA", b_srlay.name().c_str());
+	}
+
+	if(store_denoising_passes) {
+		b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth",           1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Depth Variance",  1, "Z",   b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow A",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Shadow B",        3, "XYV", b_srlay.name().c_str());
+		b_engine.add_pass("Denoising Image Variance",  3, "RGB", b_srlay.name().c_str());
 
 		if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
-			b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str());
+			b_engine.add_pass("Denoising Clean",   3, "RGB", b_srlay.name().c_str());
 		}
 	}
 #ifdef __KERNEL_DEBUG__
@@ -633,6 +642,39 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
 		Pass::add(PASS_VOLUME_INDIRECT, passes);
 	}
 
+	/* Cryptomatte stores two ID/weight pairs per RGBA layer.
+	 * User facing parameter is the number of pairs. */
+	int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")) / 2;
+	scene->film->cryptomatte_depth = crypto_depth;
+	scene->film->cryptomatte_passes = CRYPT_NONE;
+	if(get_boolean(crp, "use_pass_crypto_object")) {
+		for(int i = 0; i < crypto_depth; ++i) {
+			string passname = cryptomatte_prefix + string_printf("Object%02d", i);
+			b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str());
+			Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+		}
+		scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_OBJECT);
+	}
+	if(get_boolean(crp, "use_pass_crypto_material")) {
+		for(int i = 0; i < crypto_depth; ++i) {
+			string passname = cryptomatte_prefix + string_printf("Material%02d", i);
+			b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str());
+			Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+		}
+		scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_MATERIAL);
+	}
+	if(get_boolean(crp, "use_pass_crypto_asset")) {
+		for(int i = 0; i < crypto_depth; ++i) {
+			string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
+			b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str());
+			Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+		}
+		scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ASSET);
+	}
+	if(get_boolean(crp, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) {
+		scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ACCURATE);
+	}
+
 	return passes;
 }
 
@@ -689,6 +731,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 		params.bvh_layout = DebugFlags().cpu.bvh_layout;
 	}
 
+#ifdef WITH_EMBREE
+	params.bvh_layout = RNA_boolean_get(&cscene, "use_bvh_embree") ? BVH_LAYOUT_EMBREE : params.bvh_layout;
+#endif
 	return params;
 }
 
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 5e63f76033d..6d78f62c7d0 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -66,9 +66,9 @@ public:
 	               void **python_thread_state,
 	               const char *layer = 0);
 	void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
-	array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
-	                               BL::SceneRenderLayer& b_srlay,
-	                               const SessionParams &session_params);
+	vector<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
+	                                BL::SceneRenderLayer& b_srlay,
+	                                const SessionParams &session_params);
 	void sync_integrator();
 	void sync_camera(BL::RenderSettings& b_render,
 	                 BL::Object& b_override,
@@ -213,4 +213,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __BLENDER_SYNC_H__ */
+#endif  /* __BLENDER_SYNC_H__ */
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 7e61888348b..eb7019f45bc 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -20,6 +20,7 @@
 #include "render/mesh.h"
 
 #include "util/util_algorithm.h"
+#include "util/util_array.h"
 #include "util/util_map.h"
 #include "util/util_path.h"
 #include "util/util_set.h"
@@ -243,6 +244,12 @@ static inline float *image_get_float_pixels_for_frame(BL::Image& image,
 	return BKE_image_get_float_pixels_for_frame(image.ptr.data, frame);
 }
 
+static inline void render_add_metadata(BL::RenderResult& b_rr, string name, string value)
+{
+	b_rr.stamp_data_add_field(name.c_str(), value.c_str());  /* Pass the name/value pair on as a stamp data field of b_rr. */
+}
+
+
 /* Utilities */
 
 static inline Transform get_transform(const BL::Array<float, 16>& array)
@@ -832,4 +839,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BLENDER_UTIL_H__ */
+#endif  /* __BLENDER_UTIL_H__ */
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index fcd28572fdf..6014624f395 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -13,6 +13,7 @@ set(SRC
 	bvh8.cpp
 	bvh_binning.cpp
 	bvh_build.cpp
+	bvh_embree.cpp
 	bvh_node.cpp
 	bvh_sort.cpp
 	bvh_split.cpp
@@ -26,6 +27,7 @@ set(SRC_HEADERS
 	bvh8.h
 	bvh_binning.h
 	bvh_build.h
+	bvh_embree.h
 	bvh_node.h
 	bvh_params.h
 	bvh_sort.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index bc73a3ad264..ac0614e3659 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -26,6 +26,10 @@
 #include "bvh/bvh_build.h"
 #include "bvh/bvh_node.h"
 
+#ifdef WITH_EMBREE
+#include "bvh/bvh_embree.h"
+#endif
+
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_progress.h"
@@ -41,6 +45,7 @@ const char *bvh_layout_name(BVHLayout layout)
 		case BVH_LAYOUT_BVH4: return "BVH4";
 		case BVH_LAYOUT_BVH8: return "BVH8";
 		case BVH_LAYOUT_NONE: return "NONE";
+		case BVH_LAYOUT_EMBREE: return "EMBREE";
 		case BVH_LAYOUT_ALL:  return "ALL";
 	}
 	LOG(DFATAL) << "Unsupported BVH layout was passed.";
@@ -96,6 +101,10 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 			return new BVH4(params, objects);
 		case BVH_LAYOUT_BVH8:
 			return new BVH8(params, objects);
+		case BVH_LAYOUT_EMBREE:
+#ifdef WITH_EMBREE
+			return new BVHEmbree(params, objects);
+#endif
 		case BVH_LAYOUT_NONE:
 		case BVH_LAYOUT_ALL:
 			break;
@@ -106,7 +115,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 
 /* Building */
 
-void BVH::build(Progress& progress)
+void BVH::build(Progress& progress, Stats*)
 {
 	progress.set_substatus("Building BVH");
 
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 86be0bae4be..c8ad29004d7 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -19,12 +19,13 @@
 #define __BVH_H__
 
 #include "bvh/bvh_params.h"
-
+#include "util/util_array.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class Stats;
 class BVHNode;
 struct BVHStackEntry;
 class BVHParams;
@@ -35,7 +36,6 @@ class Progress;
 
 #define BVH_ALIGN     4096
 #define TRI_NODE_SIZE 3
-
 /* Packed BVH
  *
  * BVH stored as it will be used for traversal on the rendering device. */
@@ -91,7 +91,7 @@ public:
 	static BVH *create(const BVHParams& params, const vector<Object*>& objects);
 	virtual ~BVH() {}
 
-	void build(Progress& progress);
+	virtual void build(Progress& progress, Stats *stats=NULL);
 	void refit(Progress& progress);
 
 protected:
@@ -126,4 +126,4 @@ struct BVHStackEntry
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_H__ */
+#endif  /* __BVH_H__ */
diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h
index df65ddca5b7..ecc697567bb 100644
--- a/intern/cycles/bvh/bvh2.h
+++ b/intern/cycles/bvh/bvh2.h
@@ -84,4 +84,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH2_H__ */
+#endif  /* __BVH2_H__ */
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
index 310909a37e1..28bab2fe327 100644
--- a/intern/cycles/bvh/bvh4.h
+++ b/intern/cycles/bvh/bvh4.h
@@ -84,4 +84,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH4_H__ */
+#endif  /* __BVH4_H__ */
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
index 70d003d938a..b95fe572e27 100644
--- a/intern/cycles/bvh/bvh8.cpp
+++ b/intern/cycles/bvh/bvh8.cpp
@@ -124,6 +124,7 @@ void BVH8::pack_aligned_node(int idx,
 	data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
 	data[0].b = time_from;
 	data[0].c = time_to;
+
 	for(int i = 0; i < num; i++) {
 		float3 bb_min = bounds[i].min;
 		float3 bb_max = bounds[i].max;
@@ -140,8 +141,8 @@ void BVH8::pack_aligned_node(int idx,
 
 	for(int i = num; i < 8; i++) {
 		/* We store BB which would never be recorded as intersection
-		* so kernel might safely assume there are always 4 child nodes.
-		*/
+		 * so kernel might safely assume there are always 8 child nodes.
+		 */
 		data[1][i] = FLT_MAX;
 		data[2][i] = -FLT_MAX;
 
@@ -153,6 +154,7 @@ void BVH8::pack_aligned_node(int idx,
 
 		data[7][i] = __int_as_float(0);
 	}
+
 	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
 }
 
@@ -189,6 +191,7 @@ void BVH8::pack_unaligned_node(int idx,
 {
 	float8 data[BVH_UNALIGNED_ONODE_SIZE];
 	memset(data, 0, sizeof(data));
+
 	data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
 	data[0].b = time_from;
 	data[0].c = time_to;
@@ -222,21 +225,21 @@ void BVH8::pack_unaligned_node(int idx,
 		 * so kernel might safely assume there are always 4 child nodes.
 		 */
 
-		data[1][i] = 1.0f;
-		data[2][i] = 0.0f;
-		data[3][i] = 0.0f;
+		data[1][i] = NAN;
+		data[2][i] = NAN;
+		data[3][i] = NAN;
 
-		data[4][i] = 0.0f;
-		data[5][i] = 0.0f;
-		data[6][i] = 0.0f;
+		data[4][i] = NAN;
+		data[5][i] = NAN;
+		data[6][i] = NAN;
 
-		data[7][i] = 0.0f;
-		data[8][i] = 0.0f;
-		data[9][i] = 0.0f;
+		data[7][i] = NAN;
+		data[8][i] = NAN;
+		data[9][i] = NAN;
 
-		data[10][i] = -FLT_MAX;
-		data[11][i] = -FLT_MAX;
-		data[12][i] = -FLT_MAX;
+		data[10][i] = NAN;
+		data[11][i] = NAN;
+		data[12][i] = NAN;
 
 		data[13][i] = __int_as_float(0);
 	}
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
index 274a2442c7e..834daf3abce 100644
--- a/intern/cycles/bvh/bvh8.h
+++ b/intern/cycles/bvh/bvh8.h
@@ -95,4 +95,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH8_H__ */
+#endif  /* __BVH8_H__ */
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 7b245139819..dd95a5cc0e8 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -23,6 +23,7 @@
 #include "bvh/bvh_params.h"
 #include "bvh/bvh_unaligned.h"
 
+#include "util/util_array.h"
 #include "util/util_task.h"
 #include "util/util_vector.h"
 
@@ -142,4 +143,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_BUILD_H__ */
+#endif  /* __BVH_BUILD_H__ */
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
new file mode 100644
index 00000000000..7489fe8ea42
--- /dev/null
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -0,0 +1,884 @@
+/*
+ * Copyright 2018, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This class implements a ray accelerator for Cycles using Intel's Embree library.
+ * It supports triangles, curves, object and deformation blur and instancing.
+ * Not supported are thick line segments, those have no native equivalent in Embree.
+ * They could be implemented using Embree's thick curves, at the expense of wasted memory.
+ * User defined intersections for Embree could also be an option, but since Embree only uses aligned BVHs
+ * for user geometry, this would come with reduced performance and/or higher memory usage.
+ *
+ * Since Embree allows an object to be either curves or triangles but not both, Cycles object IDs are mapped
+ * to Embree IDs by multiplying by two and adding one for curves.
+ *
+ * This implementation shares RTCDevices between Cycles instances. Eventually each instance should get
+ * a separate RTCDevice to correctly keep track of memory usage.
+ *
+ * Vertex and index buffers are duplicated between Cycles device arrays and Embree. These could be merged,
+ * which would require changes to intersection refinement, shader setup, mesh light sampling and a few
+ * other places in Cycles where direct access to vertex data is required.
+ */
+
+#ifdef WITH_EMBREE
+
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+#include <embree3/rtcore_geometry.h>
+
+#include "bvh/bvh_embree.h"
+
+/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH. */
+#include "kernel/bvh/bvh_embree.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_random.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define IS_HAIR(x) (x & 1)
+
+/* This gets called by Embree at every valid ray/object intersection.
+ * Things like recording subsurface or shadow hits for later evaluation
+ * as well as filtering for volume objects happen here.
+ * Cycles' own BVH does that directly inside the traversal calls.
+ */
+static void rtc_filter_func(const RTCFilterFunctionNArguments *args)
+{
+	/* Cycles only issues single-ray queries, so exactly one ray/hit pair is expected here. */
+	assert(args->N == 1);
+
+	const RTCRay *ray = (RTCRay*)args->ray;
+	const RTCHit *hit = (RTCHit*)args->hit;
+	CCLIntersectContext *ctx = ((IntersectContext*)args->context)->userRayExt;
+	KernelGlobals *kg = ctx->kg;
+	const int curve_flags = kernel_data.curve.curveflags;
+	/* Only interpolated hair without the backfacing and ribbon flags is subject to culling. */
+	if(!IS_HAIR(hit->geomID) || !(curve_flags & CURVE_KN_INTERPOLATE) ||
+	   (curve_flags & (CURVE_KN_BACKFACING | CURVE_KN_RIBBONS))) {
+		return;
+	}
+	/* Reject the hit when the geometric normal points along the ray direction. */
+	if(dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+		*args->valid = 0;
+	}
+}
+
+static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments* args)
+{
+	/* Filter registered for Embree occlusion queries on triangle and curve geometry.
+	 * Records shadow, subsurface and volume hits in the Cycles intersection context.
+	 * Clearing *args->valid rejects the hit, which makes Embree continue tracing. */
+	assert(args->N == 1);
+
+	const RTCRay *ray = (RTCRay*)args->ray;
+	RTCHit *hit = (RTCHit*)args->hit;
+	CCLIntersectContext *ctx = ((IntersectContext*)args->context)->userRayExt;
+	KernelGlobals *kg = ctx->kg;
+
+	/* For all ray types: Check if there is backfacing hair to ignore */
+	if(IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+	   && !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING)
+	   && !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) {
+		if(dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+			*args->valid = 0;
+			return;
+		}
+	}
+
+	switch(ctx->type) {
+		case CCLIntersectContext::RAY_SHADOW_ALL: {
+			/* Append the intersection to the end of the array. */
+			if(ctx->num_hits < ctx->max_hits) {
+				Intersection current_isect;
+				kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+				for(size_t i = 0; i < ctx->num_hits; ++i) {
+					if(current_isect.object == ctx->isect_s[i].object &&
+					   current_isect.prim == ctx->isect_s[i].prim &&
+					   current_isect.t == ctx->isect_s[i].t) {
+						/* This intersection was already recorded, skip it without appending it again. */
+						*args->valid = 0;
+						return;
+					}
+				}
+				Intersection *isect = &ctx->isect_s[ctx->num_hits];
+				++ctx->num_hits;
+				*isect = current_isect;
+				int prim = kernel_tex_fetch(__prim_index, isect->prim);
+				int shader = 0;
+				if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
+					shader = kernel_tex_fetch(__tri_shader, prim);
+				}
+				else {
+					float4 str = kernel_tex_fetch(__curves, prim);
+					shader = __float_as_int(str.z);
+				}
+				int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags;
+				/* If no transparent shadows, all light is blocked. */
+				if(flag & (SD_HAS_TRANSPARENT_SHADOW)) {
+					/* This tells Embree to continue tracing. */
+					*args->valid = 0;
+				}
+			}
+			else {
+				/* Increase the number of hits beyond ray.max_hits
+				 * so that the caller can detect this as opaque. */
+				++ctx->num_hits;
+			}
+			break;
+		}
+		case CCLIntersectContext::RAY_SSS: {
+			/* No intersection information requested, just return a hit. */
+			if(ctx->max_hits == 0) {
+				break;
+			}
+
+			/* See triangle_intersect_subsurface() for the native equivalent. */
+			for(int i = min(ctx->max_hits, ctx->ss_isect->num_hits) - 1; i >= 0; --i) {
+				if(ctx->ss_isect->hits[i].t == ray->tfar) {
+					/* Already recorded: reject the hit so Embree continues, and do not record it twice. */
+					*args->valid = 0;
+					return;
+				}
+			}
+
+			++ctx->ss_isect->num_hits;
+			int hit_idx;
+
+			if(ctx->ss_isect->num_hits <= ctx->max_hits) {
+				hit_idx = ctx->ss_isect->num_hits - 1;
+			}
+			else {
+				/* reservoir sampling: if we are at the maximum number of
+				 * hits, randomly replace element or skip it */
+				hit_idx = lcg_step_uint(ctx->lcg_state) % ctx->ss_isect->num_hits;
+
+				if(hit_idx >= ctx->max_hits) {
+					/* This tells Embree to continue tracing. */
+					*args->valid = 0;
+					break;
+				}
+			}
+			/* record intersection */
+			kernel_embree_convert_local_hit(kg, ray, hit, &ctx->ss_isect->hits[hit_idx], ctx->sss_object_id);
+			ctx->ss_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
+			/* This tells Embree to continue tracing .*/
+			*args->valid = 0;
+			break;
+		}
+		case CCLIntersectContext::RAY_VOLUME_ALL: {
+			/* Append the intersection to the end of the array. */
+			if(ctx->num_hits < ctx->max_hits) {
+				Intersection current_isect;
+				kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+				for(size_t i = 0; i < ctx->num_hits; ++i) {
+					if(current_isect.object == ctx->isect_s[i].object &&
+					   current_isect.prim == ctx->isect_s[i].prim &&
+					   current_isect.t == ctx->isect_s[i].t) {
+						/* This intersection was already recorded, skip it without appending it again. */
+						*args->valid = 0;
+						return;
+					}
+				}
+				Intersection *isect = &ctx->isect_s[ctx->num_hits];
+				++ctx->num_hits;
+				*isect = current_isect;
+				/* Only primitives from volume object. */
+				uint tri_object = (isect->object == OBJECT_NONE) ?
+				                  kernel_tex_fetch(__prim_object, isect->prim) : isect->object;
+				int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+				if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+					--ctx->num_hits;
+				}
+				/* This tells Embree to continue tracing. */
+				*args->valid = 0;
+			}
+			break;
+		}
+		case CCLIntersectContext::RAY_REGULAR:
+		default:
+			/* Nothing to do here. */
+			break;
+	}
+}
+
+static size_t unaccounted_mem = 0;
+
+static bool rtc_memory_monitor_func(void* userPtr, const ssize_t bytes, const bool)
+{
+	/* Forward Embree's memory deltas to the Stats object passed as user pointer; always returns true. */
+	Stats *const stats = static_cast<Stats*>(userPtr);
+	if(stats == NULL) {
+		/* A stats pointer may not yet be available. Keep track of the memory usage for later. */
+		if(bytes < 0) {
+			atomic_sub_and_fetch_z(&unaccounted_mem, -bytes);
+		}
+		else {
+			atomic_add_and_fetch_z(&unaccounted_mem, bytes);
+		}
+		return true;
+	}
+	if(bytes > 0) {
+		stats->mem_alloc(bytes);
+	}
+	else {
+		stats->mem_free(-bytes);
+	}
+	return true;
+}
+
+static void rtc_error_func(void*, enum RTCError, const char* str)
+{
+	VLOG(1) << str;  /* Only the message is logged; the user pointer and error code are ignored. */
+}
+
+static double progress_start_time = 0.0f;
+
+static bool rtc_progress_func(void* user_ptr, const double n)
+{
+	/* Throttle status updates: do nothing while less than 0.25 seconds passed since the last one. */
+	if(time_dt() - progress_start_time < 0.25) {
+		return true;
+	}
+
+	Progress *progress = (Progress*)user_ptr;
+	progress->set_substatus(string_printf("Building BVH %.0f%%", n * 100.0));
+	progress_start_time = time_dt();
+
+	/* The result is false once the user requested to cancel. */
+	return !progress->get_cancel();
+}
+
+/* This is to have a shared device between all BVH instances.
+   It would be useful to actually use a separate RTCDevice per Cycles instance. */
+RTCDevice BVHEmbree::rtc_shared_device = NULL;
+int BVHEmbree::rtc_shared_users = 0;
+thread_mutex BVHEmbree::rtc_shared_mutex;
+
+BVHEmbree::BVHEmbree(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_), scene(NULL), mem_used(0), top_level(NULL), stats(NULL),
+  curve_subdivisions(params.curve_subdivisions), build_quality(RTC_BUILD_QUALITY_REFIT),
+  use_curves(params_.curve_flags & CURVE_KN_INTERPOLATE),
+  use_ribbons(params.curve_flags & CURVE_KN_RIBBONS), dynamic_scene(true)
+{
+	/* Enable flush-to-zero and denormals-are-zero, then register as user of the shared device. */
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+	thread_scoped_lock lock(rtc_shared_mutex);
+	if(rtc_shared_users == 0) {
+		rtc_shared_device = rtcNewDevice("verbose=0");
+		/* Check here if Embree was built with the correct flags. */
+		ssize_t ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED);
+		if(ret != 1) {
+			assert(0);
+			VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED flag. "\
+			           "Ray visibility will not work.";
+		}
+		ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED);
+		if(ret != 1) {
+			assert(0);
+			VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED flag. "\
+			           "Renders may not look as expected.";
+		}
+		ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED);
+		if(ret != 1) {
+			assert(0);
+			VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED flag. "\
+			           "Line primitives will not be rendered.";
+		}
+		ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED);
+		if(ret != 1) {
+			assert(0);
+			VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED flag. "\
+			           "Triangle primitives will not be rendered.";
+		}
+		ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED);
+		if(ret != 0) {
+			assert(0);
+			VLOG(1) << "Embree is compiled with the RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED flag. "\
+			           "Renders may not look as expected.";
+		}
+	}
+	++rtc_shared_users;
+
+	rtcSetDeviceErrorFunction(rtc_shared_device, rtc_error_func, NULL);
+	pack.root_index = -1;
+}
+
+BVHEmbree::~BVHEmbree()
+{
+	if(!params.top_level) {  /* A top level BVH does not call destroy() from its destructor. */
+		destroy(scene);
+	}
+}
+/* Release the given scene, then drop one user of the shared device and free the device with its last user. */
+void BVHEmbree::destroy(RTCScene scene)
+{
+	if(scene) {
+		rtcReleaseScene(scene);
+		scene = NULL;  /* NOTE(review): this only resets the by-value parameter. */
+	}
+	thread_scoped_lock lock(rtc_shared_mutex);  /* Held while the shared user count and device are modified. */
+	--rtc_shared_users;
+	if(rtc_shared_users == 0) {
+		rtcReleaseDevice (rtc_shared_device);
+		rtc_shared_device = NULL;
+	}
+}
+
+void BVHEmbree::delete_rtcScene()
+{
+	if(!scene) {
+		return;
+	}
+	if(top_level) {
+		/* This BVH is used as an instance in a top level BVH: hand the scene over
+		 * to the top level BVH so that it gets deleted later instead of now. */
+		top_level->add_delayed_delete_scene(scene);
+	}
+	else {
+		rtcReleaseScene(scene);
+		/* Also release every scene that was queued in delayed_delete_scenes. */
+		foreach(RTCScene queued_scene, delayed_delete_scenes) {
+			rtcReleaseScene(queued_scene);
+		}
+		delayed_delete_scenes.clear();
+	}
+	scene = NULL;
+}
+
+void BVHEmbree::build(Progress& progress, Stats *stats_)
+{
+	/* Build the Embree scene from all objects of this BVH, then pack the Cycles side data.
+	 * When the build gets cancelled the scene is deleted again and stats is reset. */
+	assert(rtc_shared_device);
+	stats = stats_;
+	rtcSetDeviceMemoryMonitorFunction(rtc_shared_device, rtc_memory_monitor_func, stats);
+	progress.set_substatus("Building BVH");
+
+	if(scene) {
+		rtcReleaseScene(scene);
+		scene = NULL;
+	}
+
+	const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC;
+	scene = rtcNewScene(rtc_shared_device);
+	const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) |
+	                                   RTC_SCENE_FLAG_COMPACT | RTC_SCENE_FLAG_ROBUST;
+	rtcSetSceneFlags(scene, scene_flags);
+	build_quality = dynamic ? RTC_BUILD_QUALITY_LOW :
+	               (params.use_spatial_split ? RTC_BUILD_QUALITY_HIGH : RTC_BUILD_QUALITY_MEDIUM);
+	rtcSetSceneBuildQuality(scene, build_quality);
+
+	int i = 0;
+	pack.object_node.clear();
+
+	foreach(Object *ob, objects) {
+		if(params.top_level) {
+			if(!ob->is_traceable()) {
+				++i;
+				continue;
+			}
+			if(!ob->mesh->is_instanced()) {
+				add_object(ob, i);
+			}
+			else {
+				add_instance(ob, i);
+			}
+		}
+		else {
+			add_object(ob, i);
+		}
+		++i;
+		/* Only leave the loop, so that the cancel handling below cleans up scene and stats. */
+		if(progress.get_cancel()) break;
+	}
+
+	if(progress.get_cancel()) {
+		delete_rtcScene();
+		stats = NULL;
+		return;
+	}
+
+	rtcSetSceneProgressMonitorFunction(scene, rtc_progress_func, &progress);
+	rtcCommitScene(scene);
+
+	pack_primitives();
+
+	if(progress.get_cancel()) {
+		delete_rtcScene();
+		stats = NULL;
+		return;
+	}
+
+	progress.set_substatus("Packing geometry");
+	pack_nodes(NULL);
+
+	stats = NULL;
+}
+
+void BVHEmbree::add_object(Object *ob, int i)
+{
+	/* Add each primitive kind that the mask enables and that the object's mesh actually contains. */
+	if((params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) != 0 && ob->mesh->num_triangles() > 0) {
+		add_triangles(ob, i);
+	}
+	if((params.primitive_mask & PRIMITIVE_ALL_CURVE) != 0 && ob->mesh->num_curves() > 0) {
+		add_curves(ob, i);
+	}
+}
+
+void BVHEmbree::add_instance(Object *ob, int i)
+{
+	/* Attach the Embree scene of the object's mesh BVH to this scene as instance geometry. */
+	if(ob == NULL || ob->mesh == NULL) {
+		assert(0);
+		return;
+	}
+
+	/* Make the instanced BVH point back at this BVH through its top_level member. */
+	BVHEmbree *instance_bvh = (BVHEmbree*)(ob->mesh->bvh);
+	if(instance_bvh->top_level != this) {
+		instance_bvh->top_level = this;
+	}
+
+	const bool has_motion = ob->use_motion();
+	const size_t num_motion_steps = has_motion ? ob->motion.size() : 1;
+	RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_INSTANCE);
+	rtcSetGeometryInstancedScene(geom_id, instance_bvh->scene);
+	rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
+
+	/* One transform per motion step, or only the object transform when there is no motion. */
+	for(size_t step = 0; step < num_motion_steps; ++step) {
+		const float *matrix = has_motion ? (const float*)&ob->motion[step] : (const float*)&ob->tfm;
+		rtcSetGeometryTransform(geom_id, step, RTC_FORMAT_FLOAT3X4_ROW_MAJOR, matrix);
+	}
+
+	/* An instance adds a single entry without primitive index or type to the packed arrays. */
+	pack.prim_index.push_back_slow(-1);
+	pack.prim_object.push_back_slow(i);
+	pack.prim_type.push_back_slow(PRIMITIVE_NONE);
+	pack.prim_tri_index.push_back_slow(-1);
+
+	rtcSetGeometryUserData(geom_id, (void*) instance_bvh->scene);
+	rtcSetGeometryMask(geom_id, ob->visibility);
+	rtcCommitGeometry(geom_id);
+	rtcAttachGeometryByID(scene, geom_id, i*2);
+	rtcReleaseGeometry(geom_id);
+}
+
+void BVHEmbree::add_triangles(Object *ob, int i)
+{
+	/* Create Embree triangle geometry for the object's mesh and append the matching
+	 * per-primitive entries to the packed Cycles arrays. */
+	size_t prim_offset = pack.prim_index.size();
+	Mesh *mesh = ob->mesh;
+	size_t num_motion_steps = 1;
+	if(mesh->has_motion_blur() && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+		num_motion_steps = mesh->motion_steps;
+		if(num_motion_steps > RTC_MAX_TIME_STEP_COUNT) {
+			assert(0);
+			num_motion_steps = RTC_MAX_TIME_STEP_COUNT;
+		}
+	}
+
+	const size_t num_triangles = mesh->num_triangles();
+	RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_TRIANGLE);
+	rtcSetGeometryBuildQuality(geom_id, build_quality);
+	rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
+
+	unsigned *rtc_indices = (unsigned*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_INDEX, 0,
+	                                                           RTC_FORMAT_UINT3, sizeof (int) * 3, num_triangles);
+	assert(rtc_indices);
+	if(!rtc_indices) {
+		VLOG(1) << "Embree could not create new geometry buffer for mesh " << mesh->name.c_str() << ".\n";
+		/* The geometry was never attached to the scene, release it here to avoid leaking it. */
+		rtcReleaseGeometry(geom_id);
+		return;
+	}
+	for(size_t j = 0; j < num_triangles; ++j) {
+		Mesh::Triangle t = mesh->get_triangle(j);
+		rtc_indices[j*3] = t.v[0];
+		rtc_indices[j*3+1] = t.v[1];
+		rtc_indices[j*3+2] = t.v[2];
+	}
+
+	update_tri_vertex_buffer(geom_id, mesh);
+
+	pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
+	pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
+	pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
+	pack.prim_tri_index.reserve(pack.prim_tri_index.size() + num_triangles);
+	for(size_t j = 0; j < num_triangles; ++j) {
+		pack.prim_object.push_back_reserved(i);
+		pack.prim_type.push_back_reserved(num_motion_steps > 1 ? PRIMITIVE_MOTION_TRIANGLE : PRIMITIVE_TRIANGLE);
+		pack.prim_index.push_back_reserved(j);
+		pack.prim_tri_index.push_back_reserved(j);
+	}
+
+	rtcSetGeometryUserData(geom_id, (void*) prim_offset);
+	rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
+	rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+	rtcSetGeometryMask(geom_id, ob->visibility);
+
+	rtcCommitGeometry(geom_id);
+	rtcAttachGeometryByID(scene, geom_id, i*2);
+	rtcReleaseGeometry(geom_id);
+}
+
+void BVHEmbree::update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh)
+{
+	/* Copy the vertex positions of every motion step into newly created Embree vertex buffers. */
+	const Attribute *attr_mP = NULL;
+	size_t num_motion_steps = 1;
+	int t_mid = 0;
+	if(mesh->has_motion_blur()) {
+		attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		if(attr_mP) {
+			num_motion_steps = mesh->motion_steps;
+			t_mid = (num_motion_steps - 1) / 2;
+			if(num_motion_steps > RTC_MAX_TIME_STEP_COUNT) {
+				assert(0);
+				num_motion_steps = RTC_MAX_TIME_STEP_COUNT;
+			}
+		}
+	}
+	const size_t num_verts = mesh->verts.size();
+
+	for(int t = 0; t < num_motion_steps; ++t) {
+		/* The middle step is read from the mesh vertices, all other steps from the motion attribute. */
+		const float3 *verts;
+		if(t != t_mid) {
+			const int attr_step = (t > t_mid) ? (t - 1) : t;
+			verts = &attr_mP->data_float3()[attr_step * num_verts];
+		}
+		else {
+			verts = &mesh->verts[0];
+		}
+		float *rtc_verts = (float*) rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t,
+		                                                    RTC_FORMAT_FLOAT3, sizeof(float) * 3, num_verts + 1);
+		assert(rtc_verts);
+		if(rtc_verts) {
+			for(size_t j = 0; j < num_verts; ++j) {
+				rtc_verts[3*j] = verts[j].x;
+				rtc_verts[3*j + 1] = verts[j].y;
+				rtc_verts[3*j + 2] = verts[j].z;
+			}
+		}
+	}
+}
+/* Fill new Embree vertex buffers (plus tangent buffers when use_curves is set) with the curve keys of every motion step. */
+void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh)
+{
+	const Attribute *attr_mP = NULL;
+	size_t num_motion_steps = 1;
+	if(mesh->has_motion_blur()) {
+		attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		if(attr_mP) {
+			num_motion_steps = mesh->motion_steps;
+		}
+	}
+	/* Sum of the key counts of all curves: the element count of each buffer created below. */
+	const size_t num_curves = mesh->num_curves();
+	size_t num_keys = 0;
+	for(size_t j = 0; j < num_curves; ++j) {
+		const Mesh::Curve c = mesh->get_curve(j);
+		num_keys += c.num_keys;
+	}
+
+	/* Copy the CV data to Embree */
+	const int t_mid = (num_motion_steps - 1) / 2;
+	const float *curve_radius = &mesh->curve_radius[0];
+	for(int t = 0; t < num_motion_steps; ++t) {
+		const float3 *verts;
+		if(t == t_mid || attr_mP == NULL) {
+			verts = &mesh->curve_keys[0];
+		}
+		else {
+			int t_ = (t > t_mid) ? (t - 1) : t;
+			verts = &attr_mP->data_float3()[t_ * num_keys];
+		}
+		/* Each buffer element is a float4: the key position converted from float3, with the radius in w. */
+		float4 *rtc_verts = (float4*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t,
+		                                                     RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys);
+		float4 *rtc_tangents = NULL;
+		if(use_curves) {
+			rtc_tangents = (float4*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_TANGENT, t,
+																RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys);
+			assert(rtc_tangents);
+		}
+		assert(rtc_verts);
+		if(rtc_verts) {
+			if(use_curves && rtc_tangents) {
+				const size_t num_curves = mesh->num_curves();
+				for(size_t j = 0; j < num_curves; ++j) {
+					Mesh::Curve c = mesh->get_curve(j);
+					int fk = c.first_key;
+					rtc_verts[0] = float3_to_float4(verts[fk]);
+					rtc_verts[0].w = curve_radius[fk];
+					rtc_tangents[0] = float3_to_float4(verts[fk + 1] - verts[fk]);  /* First key: difference to the next key. */
+					rtc_tangents[0].w = curve_radius[fk + 1] - curve_radius[fk];
+					++fk;
+					int k = 1;
+					for(;k < c.num_segments(); ++k, ++fk) {
+						rtc_verts[k] = float3_to_float4(verts[fk]);
+						rtc_verts[k].w = curve_radius[fk];
+						rtc_tangents[k] = float3_to_float4((verts[fk + 1] - verts[fk - 1]) * 0.5f);  /* Inner keys: half the difference of both neighbours. */
+						rtc_tangents[k].w = (curve_radius[fk + 1] - curve_radius[fk - 1]) * 0.5f;
+					}
+					rtc_verts[k] = float3_to_float4(verts[fk]);
+					rtc_verts[k].w = curve_radius[fk];
+					rtc_tangents[k] = float3_to_float4(verts[fk] - verts[fk - 1]);  /* Last key: difference to the previous key. */
+					rtc_tangents[k].w = curve_radius[fk] - curve_radius[fk - 1];
+					rtc_verts += c.num_keys;  /* Advance both output pointers to the first key of the next curve. */
+					rtc_tangents += c.num_keys;
+				}
+			}
+			else {
+				for(size_t j = 0; j < num_keys; ++j) {
+					rtc_verts[j] = float3_to_float4(verts[j]);
+					rtc_verts[j].w = curve_radius[j];
+				}
+			}
+		}
+	}
+}
+
+void BVHEmbree::add_curves(Object *ob, int i)
+{
+	/* Create Embree curve geometry with one primitive per curve segment of the object's mesh.
+	 * If the index buffer cannot be created the geometry is released and nothing is added. */
+	size_t prim_offset = pack.prim_index.size();
+	const Mesh *mesh = ob->mesh;
+	size_t num_motion_steps = 1;
+	if(mesh->has_motion_blur() && mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+		num_motion_steps = mesh->motion_steps;
+	}
+
+	const size_t num_curves = mesh->num_curves();
+	size_t num_segments = 0;
+	for(size_t j = 0; j < num_curves; ++j) {
+		Mesh::Curve c = mesh->get_curve(j);
+		assert(c.num_segments() > 0);
+		num_segments += c.num_segments();
+	}
+
+	enum RTCGeometryType type = (!use_curves) ? RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
+	                            (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE :
+	                                           RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE);
+	RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type);
+	rtcSetGeometryTessellationRate(geom_id, curve_subdivisions);
+	unsigned *rtc_indices = (unsigned*) rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_INDEX, 0,
+	                                                            RTC_FORMAT_UINT, sizeof (int), num_segments);
+	assert(rtc_indices);
+	if(!rtc_indices) {
+		VLOG(1) << "Embree could not create new geometry buffer for mesh " << mesh->name.c_str() << ".\n";
+		rtcReleaseGeometry(geom_id);
+		return;
+	}
+
+	/* Make room for Cycles specific data. */
+	pack.prim_object.reserve(pack.prim_object.size() + num_segments);
+	pack.prim_type.reserve(pack.prim_type.size() + num_segments);
+	pack.prim_index.reserve(pack.prim_index.size() + num_segments);
+	pack.prim_tri_index.reserve(pack.prim_tri_index.size() + num_segments);
+	size_t rtc_index = 0;
+	for(size_t j = 0; j < num_curves; ++j) {
+		Mesh::Curve c = mesh->get_curve(j);
+		for(size_t k = 0; k < c.num_segments(); ++k) {
+			rtc_indices[rtc_index] = c.first_key + k;
+			/* Cycles specific data. */
+			pack.prim_object.push_back_reserved(i);
+			pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(num_motion_steps > 1 ?
+			                                                         PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k));
+			pack.prim_index.push_back_reserved(j);
+			pack.prim_tri_index.push_back_reserved(rtc_index);
+			++rtc_index;
+		}
+	}
+
+	rtcSetGeometryBuildQuality(geom_id, build_quality);
+	rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
+	update_curve_vertex_buffer(geom_id, mesh);
+	rtcSetGeometryUserData(geom_id, (void*) prim_offset);
+	rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
+	rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+	rtcSetGeometryMask(geom_id, ob->visibility);
+
+	rtcCommitGeometry(geom_id);
+	rtcAttachGeometryByID(scene, geom_id, i * 2 + 1);
+	rtcReleaseGeometry(geom_id);
+}
+
+/* Merge the primitive index, type, object and triangle vertex arrays of every
+ * mesh BVH into this BVH's pack. Does nothing unless params.top_level is set;
+ * the BVHNode argument is unused. */
+void BVHEmbree::pack_nodes(const BVHNode *)
+{
+	/* Quite a bit of this code is for compatibility with Cycles' native BVH. */
+	if(!params.top_level) {
+		return;
+	}
+
+	/* Shift the entries already in the pack by their mesh's curve/triangle offset. */
+	for(size_t i = 0; i < pack.prim_index.size(); ++i) {
+		if(pack.prim_index[i] != -1) {
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
+				pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
+			else
+				pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
+		}
+	}
+
+	size_t prim_offset = pack.prim_index.size();
+
+	/* reserve: work out how large the merged arrays have to be. */
+	size_t prim_index_size = pack.prim_index.size();
+	size_t prim_tri_verts_size = pack.prim_tri_verts.size();
+
+	size_t pack_prim_index_offset = prim_index_size;
+	size_t pack_prim_tri_verts_offset = prim_tri_verts_size;
+	size_t object_offset = 0;
+
+	map<Mesh*, int> mesh_map;
+
+	foreach(Object *ob, objects) {
+		Mesh *mesh = ob->mesh;
+		BVH *bvh = mesh->bvh;
+
+		if(mesh->need_build_bvh()) {
+			if(mesh_map.find(mesh) == mesh_map.end()) {
+				prim_index_size += bvh->pack.prim_index.size();
+				prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
+				mesh_map[mesh] = 1;  /* Value is only a "seen" marker in this pass. */
+			}
+		}
+	}
+
+	mesh_map.clear();
+
+	pack.prim_index.resize(prim_index_size);
+	pack.prim_type.resize(prim_index_size);
+	pack.prim_object.resize(prim_index_size);
+	pack.prim_visibility.clear();
+	pack.prim_tri_verts.resize(prim_tri_verts_size);
+	pack.prim_tri_index.resize(prim_index_size);
+	pack.object_node.resize(objects.size());
+
+	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
+	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
+	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
+	float4 *pack_prim_tri_verts = (pack.prim_tri_verts.size())? &pack.prim_tri_verts[0]: NULL;
+	uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL;
+
+	/* merge */
+	foreach(Object *ob, objects) {
+		Mesh *mesh = ob->mesh;
+
+		/* A mesh that doesn't need its own BVH is assumed to be part of the
+		 * top-level BVH already, so there is nothing to pack for it. */
+		if(!mesh->need_build_bvh()) {
+			pack.object_node[object_offset++] = prim_offset;
+			continue;
+		}
+
+		/* If the mesh was already merged once, don't add it again; just reuse
+		 * the offset recorded for it as this object's node entry. */
+		map<Mesh*, int>::iterator it = mesh_map.find(mesh);
+		if(it != mesh_map.end()) {
+			pack.object_node[object_offset++] = it->second;
+			continue;
+		}
+
+		BVHEmbree *bvh = (BVHEmbree*)mesh->bvh;
+
+		rtc_memory_monitor_func(stats, unaccounted_mem, true);
+		unaccounted_mem = 0;
+
+		int mesh_tri_offset = mesh->tri_offset;
+		int mesh_curve_offset = mesh->curve_offset;
+
+		/* fill in node indexes for instances */
+		pack.object_node[object_offset++] = prim_offset;
+
+		mesh_map[mesh] = pack.object_node[object_offset-1];
+
+		/* merge primitive, object and triangle indexes */
+		if(bvh->pack.prim_index.size()) {
+			size_t bvh_prim_index_size = bvh->pack.prim_index.size();
+			int *bvh_prim_index = &bvh->pack.prim_index[0];
+			int *bvh_prim_type = &bvh->pack.prim_type[0];
+			uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0];
+
+			for(size_t i = 0; i < bvh_prim_index_size; ++i) {
+				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
+					pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
+					pack_prim_tri_index[pack_prim_index_offset] = -1;  /* Stored in a uint, i.e. all bits set. */
+				}
+				else {
+					pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
+					pack_prim_tri_index[pack_prim_index_offset] = bvh_prim_tri_index[i] + pack_prim_tri_verts_offset;
+				}
+
+				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
+				pack_prim_object[pack_prim_index_offset] = 0;
+
+				++pack_prim_index_offset;
+			}
+		}
+
+		/* Merge triangle vertices data. */
+		if(bvh->pack.prim_tri_verts.size()) {
+			const size_t prim_tri_size = bvh->pack.prim_tri_verts.size();
+			memcpy(pack_prim_tri_verts + pack_prim_tri_verts_offset,
+				   &bvh->pack.prim_tri_verts[0],
+				   prim_tri_size*sizeof(float4));
+			pack_prim_tri_verts_offset += prim_tri_size;
+		}
+
+		prim_offset += bvh->pack.prim_index.size();
+	}
+}
+
+void BVHEmbree::refit_nodes()
+{
+	/* Refresh every vertex buffer first, then let Embree rebuild/refit the scene. */
+	unsigned geom_id = 0;
+	foreach(Object *ob, objects) {
+		const bool refit_object = !params.top_level || (ob->is_traceable() && !ob->mesh->is_instanced());
+		if(refit_object && (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) && ob->mesh->num_triangles() > 0) {
+			RTCGeometry tri_geom = rtcGetGeometry(scene, geom_id);
+			update_tri_vertex_buffer(tri_geom, ob->mesh);
+			rtcCommitGeometry(tri_geom);
+		}
+		if(refit_object && (params.primitive_mask & PRIMITIVE_ALL_CURVE) && ob->mesh->num_curves() > 0) {
+			RTCGeometry curve_geom = rtcGetGeometry(scene, geom_id + 1);
+			update_curve_vertex_buffer(curve_geom, ob->mesh);
+			rtcCommitGeometry(curve_geom);
+		}
+		geom_id += 2;  /* Triangles use the even geometry ID, curves the following odd one. */
+	}
+	rtcCommitScene(scene);
+}
+CCL_NAMESPACE_END
+
+#endif  /* WITH_EMBREE */
diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h
new file mode 100644
index 00000000000..9990826ba98
--- /dev/null
+++ b/intern/cycles/bvh/bvh_embree.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2018, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_EMBREE_H__
+#define __BVH_EMBREE_H__
+
+#ifdef WITH_EMBREE
+
+#include <embree3/rtcore.h>
+#include <embree3/rtcore_scene.h>
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Mesh;
+
+class BVHEmbree : public BVH  /* BVH implementation backed by an Embree RTCScene. */
+{
+public:
+	virtual void build(Progress& progress, Stats *stats) override;
+	virtual ~BVHEmbree();
+	RTCScene scene;  /* Embree scene the geometries are attached to; committed again on refit. */
+	static void destroy(RTCScene);  /* NOTE(review): presumably releases the given scene - confirm in bvh_embree.cpp. */
+protected:
+	friend class BVH;  /* NOTE(review): presumably so BVH can call the protected constructor below. */
+	BVHEmbree(const BVHParams& params, const vector<Object*>& objects);
+
+	virtual void pack_nodes(const BVHNode*) override;  /* Merges mesh BVH primitive arrays into the top-level pack. */
+	virtual void refit_nodes() override;  /* Updates vertex buffers, then re-commits the Embree scene. */
+
+	void add_object(Object *ob, int i);
+	void add_instance(Object *ob, int i);
+	void add_curves(Object *ob, int i);  /* Attaches the curve geometry under Embree geometry ID i*2+1. */
+	void add_triangles(Object *ob, int i);
+
+	ssize_t mem_used;  /* NOTE(review): presumably memory attributed to this BVH - confirm against the memory monitor. */
+
+	void add_delayed_delete_scene(RTCScene scene) { delayed_delete_scenes.push_back(scene); }
+	BVHEmbree *top_level;
+private:
+	void delete_rtcScene();
+	void update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh);
+	void update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh);
+
+	static RTCDevice rtc_shared_device;  /* Embree device used to create geometries; static, so shared by all instances. */
+	static int rtc_shared_users;  /* NOTE(review): presumably counts users of rtc_shared_device - confirm. */
+	static thread_mutex rtc_shared_mutex;
+
+	Stats *stats;  /* Passed to rtc_memory_monitor_func() while packing. */
+	vector<RTCScene> delayed_delete_scenes;  /* Filled by add_delayed_delete_scene(). */
+	int curve_subdivisions;  /* Tessellation rate set on curve geometries. */
+	enum RTCBuildQuality build_quality;  /* Passed to rtcSetGeometryBuildQuality(). */
+	bool use_curves, use_ribbons, dynamic_scene;  /* use_curves/use_ribbons select the Embree curve geometry type. */
+};
+
+CCL_NAMESPACE_END
+
+#endif  /* WITH_EMBREE */
+
+#endif  /* __BVH_EMBREE_H__ */
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index ed89d52a50a..65d5df01158 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -169,4 +169,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_NODE_H__ */
+#endif  /* __BVH_NODE_H__ */
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index d8dd7df6ba1..6408d56da80 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -90,6 +90,13 @@ public:
 	/* Same as above, but for triangle primitives. */
 	int num_motion_triangle_steps;
 
+	/* Same as in SceneParams. */
+	int bvh_type;
+
+	/* These are needed for Embree. */
+	int curve_flags;
+	int curve_subdivisions;
+
 	/* fixed parameters */
 	enum {
 		MAX_DEPTH = 64,
@@ -123,6 +130,11 @@ public:
 
 		num_motion_curve_steps = 0;
 		num_motion_triangle_steps = 0;
+
+		bvh_type = 0;
+
+		curve_flags = 0;
+		curve_subdivisions = 4;
 	}
 
 	/* SAH costs */
@@ -274,4 +286,4 @@ struct BVHSpatialStorage {
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_PARAMS_H__ */
+#endif  /* __BVH_PARAMS_H__ */
diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h
index 936401d8607..6910cc1e9b4 100644
--- a/intern/cycles/bvh/bvh_sort.h
+++ b/intern/cycles/bvh/bvh_sort.h
@@ -35,4 +35,4 @@ void bvh_reference_sort(int start,
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_SORT_H__ */
+#endif  /* __BVH_SORT_H__ */
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index a874a118b99..cb47deab211 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -259,4 +259,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_SPLIT_H__ */
+#endif  /* __BVH_SPLIT_H__ */
diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h
index c3ece051cd5..bcfb6ed68da 100644
--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -77,4 +77,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __BVH_UNALIGNED_H__ */
+#endif  /* __BVH_UNALIGNED_H__ */
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 2e386a6bfc5..d0f473a2939 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -133,6 +133,12 @@ if(CYCLES_STANDALONE_REPOSITORY)
 	set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB")
 
 	####
+	# embree
+	if(WITH_CYCLES_EMBREE)
+		find_package(embree 3.2.4 REQUIRED)
+	endif()
+
+	####
 	# Logging
 	if(WITH_CYCLES_LOGGING)
 		find_package(Glog REQUIRED)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 6959dd73c32..7e20bb449c3 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -361,7 +361,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
 
 	info.has_half_images = true;
 	info.has_volume_decoupled = true;
-	info.bvh_layout_mask = BVH_LAYOUT_ALL;
 	info.has_osl = true;
 
 	foreach(const DeviceInfo &device, subdevices) {
@@ -396,7 +395,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
 		/* Accumulate device info. */
 		info.has_half_images &= device.has_half_images;
 		info.has_volume_decoupled &= device.has_volume_decoupled;
-		info.bvh_layout_mask = device.bvh_layout_mask & info.bvh_layout_mask;
 		info.has_osl &= device.has_osl;
 	}
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2400788c833..f3fb338e638 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -58,7 +58,6 @@ public:
 	bool advanced_shading;          /* Supports full shading system. */
 	bool has_half_images;           /* Support half-float textures. */
 	bool has_volume_decoupled;      /* Decoupled volume shading. */
-	BVHLayoutMask bvh_layout_mask;  /* Bitmask of supported BVH layouts. */
 	bool has_osl;                   /* Support Open Shading Language. */
 	bool use_split_kernel;          /* Use split or mega kernel. */
 	int cpu_threads;
@@ -74,7 +73,6 @@ public:
 		advanced_shading = true;
 		has_half_images = false;
 		has_volume_decoupled = false;
-		bvh_layout_mask = BVH_LAYOUT_NONE;
 		has_osl = false;
 		use_split_kernel = false;
 	}
@@ -183,7 +181,7 @@ public:
 	/* Convert the requested features structure to a build options,
 	 * which could then be passed to compilers.
 	 */
-	string get_build_options(void) const
+	string get_build_options() const
 	{
 		string build_options = "";
 		if(experimental) {
@@ -242,8 +240,8 @@ std::ostream& operator <<(std::ostream &os,
 /* Device */
 
 struct DeviceDrawParams {
-	function<void(void)> bind_display_space_shader_cb;
-	function<void(void)> unbind_display_space_shader_cb;
+	function<void()> bind_display_space_shader_cb;
+	function<void()> unbind_display_space_shader_cb;
 };
 
 class Device {
@@ -281,6 +279,7 @@ public:
 		fflush(stderr);
 	}
 	virtual bool show_samples() const { return false; }
+	virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
 
 	/* statistics */
 	Stats &stats;
@@ -361,4 +360,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_H__ */
+#endif  /* __DEVICE_H__ */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 7c72ab1a009..76f6466bbde 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -41,6 +41,7 @@
 #include "kernel/osl/osl_globals.h"
 
 #include "render/buffers.h"
+#include "render/coverage.h"
 
 #include "util/util_debug.h"
 #include "util/util_foreach.h"
@@ -80,11 +81,11 @@ public:
 
 		/* Silence potential warnings about unused variables
 		 * when compiling without some architectures. */
-		(void)kernel_sse2;
-		(void)kernel_sse3;
-		(void)kernel_sse41;
-		(void)kernel_avx;
-		(void)kernel_avx2;
+		(void) kernel_sse2;
+		(void) kernel_sse3;
+		(void) kernel_sse41;
+		(void) kernel_avx;
+		(void) kernel_avx2;
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
 			architecture_name = "AVX2";
@@ -184,11 +185,11 @@ public:
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
 	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
-	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
-	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
-	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
-	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
-	KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)>   filter_nlm_calc_difference_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_blur_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_calc_weight_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+	KernelFunctions<void(*)(float*, float*, int*, int)>                                        filter_nlm_normalize_kernel;
 
 	KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
 	KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -277,6 +278,20 @@ public:
 		return (info.cpu_threads == 1);
 	}
 
+	/* Report the BVH layouts this CPU device can use: BVH2 always, BVH4/BVH8 when
+	 * SSE2/AVX2 are both enabled in the debug flags and supported by the system. */
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		BVHLayoutMask supported_layouts = BVH_LAYOUT_BVH2;
+		if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2())
+			supported_layouts |= BVH_LAYOUT_BVH4;
+		if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2())
+			supported_layouts |= BVH_LAYOUT_BVH8;
+#ifdef WITH_EMBREE
+		supported_layouts |= BVH_LAYOUT_EMBREE;  /* Only offered when built with Embree. */
+#endif  /* WITH_EMBREE */
+		return supported_layouts;
+	}
+
 	void load_texture_info()
 	{
 		if(need_texture_info) {
@@ -499,6 +514,7 @@ public:
 			filter_nlm_update_output_kernel()(dx, dy,
 			                                  blurDifference,
 			                                  (float*) image_ptr,
+			                                  difference,
 			                                  (float*) out_ptr,
 			                                  weightAccum,
 			                                  local_rect,
@@ -676,12 +692,22 @@ public:
 
 	void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
 	{
+		const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
 		scoped_timer timer(&tile.buffers->render_time);
 
+		Coverage coverage(kg, tile);
+		if(use_coverage) {
+			coverage.init_path_trace();
+		}
+
 		float *render_buffer = (float*)tile.buffer;
 		int start_sample = tile.start_sample;
 		int end_sample = tile.start_sample + tile.num_samples;
 
+		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+		_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+		
 		for(int sample = start_sample; sample < end_sample; sample++) {
 			if(task.get_cancel() || task_pool.canceled()) {
 				if(task.need_finish_queue == false)
@@ -690,6 +716,9 @@ public:
 
 			for(int y = tile.y; y < tile.y + tile.h; y++) {
 				for(int x = tile.x; x < tile.x + tile.w; x++) {
+					if(use_coverage) {
+						coverage.init_pixel(x, y);
+					}
 					path_trace_kernel()(kg, render_buffer,
 					                    sample, x, y, tile.offset, tile.stride);
 				}
@@ -699,6 +728,9 @@ public:
 
 			task.update_progress(&tile, tile.w*tile.h);
 		}
+		if(use_coverage) {
+			coverage.finalize();
+		}
 	}
 
 	void denoise(DenoisingTask& denoising, RenderTile &tile)
@@ -759,7 +791,6 @@ public:
 			}
 			else if(tile.task == RenderTile::DENOISE) {
 				denoise(denoising, tile);
-
 				task.update_progress(&tile, tile.w*tile.h);
 			}
 
@@ -1027,13 +1058,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
-	info.bvh_layout_mask = BVH_LAYOUT_BVH2;
-	if(system_cpu_support_sse2()) {
-		info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
-	}
-	if(system_cpu_support_avx2()) {
-		info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
-	}
 	info.has_volume_decoupled = true;
 	info.has_osl = true;
 	info.has_half_images = true;
@@ -1041,7 +1065,7 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	devices.insert(devices.begin(), info);
 }
 
-string device_cpu_capabilities(void)
+string device_cpu_capabilities()
 {
 	string capabilities = "";
 	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index da8e49f129f..46e7b043603 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -73,12 +73,12 @@ const char *cuewErrorString(CUresult result)
 	return error.c_str();
 }
 
-const char *cuewCompilerPath(void)
+const char *cuewCompilerPath()
 {
 	return CYCLES_CUDA_NVCC_EXECUTABLE;
 }
 
-int cuewCompilerVersion(void)
+int cuewCompilerVersion()
 {
 	return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
 }
@@ -181,6 +181,10 @@ public:
 		return true;
 	}
 
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		return BVH_LAYOUT_BVH2;  /* BVH2 is the only layout reported for CUDA devices. */
+	}
+
 /*#ifdef NDEBUG
 #define cuda_abort()
 #else
@@ -207,7 +211,7 @@ public:
 			/*cuda_abort();*/ \
 			cuda_error_documentation(); \
 		} \
-	} (void)0
+	} (void) 0
 
 	bool cuda_error_(CUresult result, const string& stmt)
 	{
@@ -1397,18 +1401,14 @@ public:
 		int h = task->reconstruction_state.source_h;
 		int stride = task->buffer.stride;
 
-		int shift_stride = stride*h;
+		int pass_stride = task->buffer.pass_stride;
 		int num_shifts = (2*r+1)*(2*r+1);
-		int mem_size = sizeof(float)*shift_stride*num_shifts;
-
-		device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
-		temporary_mem.alloc_to_device(2*mem_size);
 
 		if(have_error())
 			return false;
 
-		CUdeviceptr difference     = cuda_device_ptr(temporary_mem.device_pointer);
-		CUdeviceptr blurDifference = difference + mem_size;
+		CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+		CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
 
 		{
 			CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
@@ -1426,9 +1426,9 @@ public:
 			                     task->reconstruction_state.source_w * task->reconstruction_state.source_h,
 			                     num_shifts);
 
-			void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
-			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
-			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+			void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
+			void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+			void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
 			void *construct_gramian_args[] = {&blurDifference,
 			                                  &task->buffer.mem.device_pointer,
 			                                  &task->storage.transform.device_pointer,
@@ -1437,9 +1437,8 @@ public:
 			                                  &task->storage.XtWY.device_pointer,
 			                                  &task->reconstruction_state.filter_window,
 			                                  &w, &h, &stride,
-			                                  &shift_stride, &r,
-			                                  &f,
-		                                      &task->buffer.pass_stride};
+			                                  &pass_stride, &r,
+			                                  &f};
 
 			CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
 			CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1448,8 +1447,6 @@ public:
 			CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
 		}
 
-		temporary_mem.free();
-
 		{
 			CUfunction cuFinalize;
 			cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
@@ -1667,7 +1664,7 @@ public:
 		for(int sample = start_sample; sample < end_sample; sample += step_samples) {
 			/* Setup and copy work tile to device. */
 			wtile->start_sample = sample;
-			wtile->num_samples = min(step_samples, end_sample - sample);;
+			wtile->num_samples = min(step_samples, end_sample - sample);
 			work_tiles.copy_to_device();
 
 			CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
@@ -2149,7 +2146,7 @@ public:
 			/*cuda_abort();*/ \
 			device->cuda_error_documentation(); \
 		} \
-	} (void)0
+	} (void) 0
 
 
 /* CUDA context scope. */
@@ -2358,7 +2355,7 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory&
 	return global_size;
 }
 
-bool device_cuda_init(void)
+bool device_cuda_init()
 {
 #ifdef WITH_CUDA_DYNLOAD
 	static bool initialized = false;
@@ -2396,7 +2393,7 @@ bool device_cuda_init(void)
 	return result;
 #else  /* WITH_CUDA_DYNLOAD */
 	return true;
-#endif /* WITH_CUDA_DYNLOAD */
+#endif  /* WITH_CUDA_DYNLOAD */
 }
 
 Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
@@ -2466,7 +2463,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.advanced_shading = (major >= 3);
 		info.has_half_images = (major >= 3);
 		info.has_volume_decoupled = false;
-		info.bvh_layout_mask = BVH_LAYOUT_BVH2;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
@@ -2501,7 +2497,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
 }
 
-string device_cuda_capabilities(void)
+string device_cuda_capabilities()
 {
 	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
@@ -2534,7 +2530,7 @@ string device_cuda_capabilities(void)
 				capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
 				                              value); \
 			} \
-		} (void)0
+		} (void) 0
 		/* TODO(sergey): Strip all attributes which are not useful for us
 		 * or does not depend on the driver.
 		 */
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 23c18fa15b2..78c65a3d22d 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
 	buffer.mem.alloc_to_device(mem_size, false);
 
 	/* CPUs process shifts sequentially while GPUs process them in parallel. */
-	int num_shifts = 1;
+	int num_layers;
 	if(buffer.gpu_temporary_mem) {
 		/* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
 		int max_radius = max(radius, 6);
-		num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+		int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+		num_layers = 2*num_shifts + 1;
+	}
+	else {
+		num_layers = 3;
 	}
 	/* Allocate two layers per shift as well as one for the weight accumulation. */
-	buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride);
+	buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
 }
 
 void DenoisingTask::prefilter_shadowing()
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 7474f71ff78..8e0666d0e59 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -166,4 +166,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_DENOISING_H__ */
+#endif  /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 941be448101..e6495c2bff3 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -22,9 +22,9 @@ CCL_NAMESPACE_BEGIN
 class Device;
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background);
-bool device_opencl_init(void);
+bool device_opencl_init();
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background);
-bool device_cuda_init(void);
+bool device_cuda_init();
 Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background);
 Device *device_network_create(DeviceInfo& info, Stats &stats, const char *address);
 Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background);
@@ -34,10 +34,10 @@ void device_opencl_info(vector<DeviceInfo>& devices);
 void device_cuda_info(vector<DeviceInfo>& devices);
 void device_network_info(vector<DeviceInfo>& devices);
 
-string device_cpu_capabilities(void);
-string device_opencl_capabilities(void);
-string device_cuda_capabilities(void);
+string device_cpu_capabilities();
+string device_opencl_capabilities();
+string device_cuda_capabilities();
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_INTERN_H__ */
+#endif  /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 2b4835c9c65..e43834bdc8d 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -21,6 +21,7 @@
  *
  * Data types for allocating, copying and freeing device memory. */
 
+#include "util/util_array.h"
 #include "util/util_half.h"
 #include "util/util_texture.h"
 #include "util/util_types.h"
@@ -496,4 +497,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_MEMORY_H__ */
+#endif  /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index f1bd3fd13e1..490ee3951c9 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -103,6 +103,14 @@ public:
 		return devices.front().device->show_samples();
 	}
 
+	/* Intersection of the BVH layouts reported by all sub-devices. */
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		BVHLayoutMask common_layouts = BVH_LAYOUT_ALL;
+		foreach(const SubDevice& sub, devices)
+			common_layouts = common_layouts & sub.device->get_bvh_layout_mask();
+		return common_layouts;
+	}
+
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		foreach(SubDevice& sub, devices)
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 204e405421d..b6e18621f12 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -87,6 +87,10 @@ public:
 		snd.write();
 	}
 
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		return BVH_LAYOUT_BVH2;  /* BVH2 is the only layout reported for the network device. */
+	}
+
 	void mem_alloc(device_memory& mem)
 	{
 		if(mem.name) {
@@ -306,7 +310,6 @@ void device_network_info(vector<DeviceInfo>& devices)
 	/* todo: get this info from device */
 	info.advanced_shading = true;
 	info.has_volume_decoupled = false;
-	info.bvh_layout_mask = BVH_LAYOUT_BVH2;
 	info.has_osl = false;
 
 	devices.push_back(info);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 96e0de742db..67626ae177f 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -488,4 +488,4 @@ CCL_NAMESPACE_END
 
 #endif
 
-#endif /* __DEVICE_NETWORK_H__ */
+#endif  /* __DEVICE_NETWORK_H__ */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index be0f8f45399..71410f80d57 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -44,7 +44,7 @@ Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 	}
 }
 
-bool device_opencl_init(void)
+bool device_opencl_init()
 {
 	static bool initialized = false;
 	static bool result = false;
@@ -136,7 +136,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
 		info.has_volume_decoupled = false;
-		info.bvh_layout_mask = BVH_LAYOUT_BVH2;
 		info.id = id;
 
 		/* Check OpenCL extensions */
@@ -147,7 +146,7 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 	}
 }
 
-string device_opencl_capabilities(void)
+string device_opencl_capabilities()
 {
 	if(OpenCLInfo::device_type() == 0) {
 		return "All OpenCL devices are forced to be OFF";
@@ -246,4 +245,4 @@ string device_opencl_capabilities(void)
 
 CCL_NAMESPACE_END
 
-#endif /* WITH_OPENCL */
+#endif  /* WITH_OPENCL */
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 26ddce5bb22..5af4367d1b6 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -130,4 +130,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+#endif  /* __DEVICE_SPLIT_KERNEL_H__ */
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index ec87aa8c560..861014373b3 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -64,7 +64,7 @@ public:
 	function<void(long, int)> update_progress_sample;
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
-	function<bool(void)> get_cancel;
+	function<bool()> get_cancel;
 	function<void(RenderTile*, Device*)> map_neighbor_tiles;
 	function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
 
@@ -85,4 +85,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __DEVICE_TASK_H__ */
+#endif  /* __DEVICE_TASK_H__ */
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 6c73d10a376..8cb7f6d0b82 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -245,7 +245,7 @@ public:
 				(device)->set_error(message); \
 			fprintf(stderr, "%s\n", message.c_str()); \
 		} \
-	} (void)0
+	} (void) 0
 
 #define opencl_assert(stmt) \
 	{ \
@@ -257,7 +257,7 @@ public:
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
 		} \
-	} (void)0
+	} (void) 0
 
 class OpenCLDeviceBase : public Device
 {
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index cc887134bb0..1e73d37d7a4 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -761,7 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
 	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
 	cl_mem out_mem = CL_MEM_PTR(out_ptr);
 
-	mem_zero_kernel(*difference, sizeof(float)*pass_stride);
+	mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
 	mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
 
 	cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
@@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 	int h = task->reconstruction_state.source_h;
 	int stride = task->buffer.stride;
 
-	int shift_stride = stride*h;
-	int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
-	int mem_size = sizeof(float)*shift_stride*num_shifts;
+	int r = task->radius;
+	int pass_stride = task->buffer.pass_stride;
+	int num_shifts = (2*r+1)*(2*r+1);
 
-	cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
-	cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-	opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+	device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
+	device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
+	cl_mem difference_mem = CL_MEM_PTR(*difference);
+	cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
 
 	kernel_set_args(ckNLMCalcDifference, 0,
 	                color_mem,
 	                color_variance_mem,
-	                difference,
+	                difference_mem,
 	                w, h, stride,
-	                shift_stride,
-	                task->radius,
-	                task->buffer.pass_stride,
+	                pass_stride,
+	                r,
+	                pass_stride,
 	                1.0f, task->nlm_k_2);
 	kernel_set_args(ckNLMBlur, 0,
-	                difference,
-	                blurDifference,
+	                difference_mem,
+	                blurDifference_mem,
 	                w, h, stride,
-	                shift_stride,
-	                task->radius, 4);
+	                pass_stride,
+	                r, 4);
 	kernel_set_args(ckNLMCalcWeight, 0,
-	                blurDifference,
-	                difference,
+	                blurDifference_mem,
+	                difference_mem,
 	                w, h, stride,
-	                shift_stride,
-	                task->radius, 4);
+	                pass_stride,
+	                r, 4);
 	kernel_set_args(ckNLMConstructGramian, 0,
-	                blurDifference,
+	                blurDifference_mem,
 	                buffer_mem,
 	                transform_mem,
 	                rank_mem,
@@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 	                XtWY_mem,
 	                task->reconstruction_state.filter_window,
 	                w, h, stride,
-	                shift_stride,
-	                task->radius, 4,
-	                task->buffer.pass_stride);
+	                pass_stride,
+	                r, 4);
 
 	enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
 	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
@@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
 	enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
 	enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
 
-	opencl_assert(clReleaseMemObject(difference));
-	opencl_assert(clReleaseMemObject(blurDifference));
-
 	kernel_set_args(ckFinalize, 0,
 	                output_mem,
 	                rank_mem,
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index e004c0b44f4..89001366d9d 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -43,6 +43,10 @@ public:
 		return true;
 	}
 
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		return BVH_LAYOUT_BVH2;  /* BVH2 is the only layout reported by this device. */
+	}
+
 	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 66a4aa7e891..adb73bc6e2c 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -95,6 +95,10 @@ public:
 		return true;
 	}
 
+	virtual BVHLayoutMask get_bvh_layout_mask() const {
+		return BVH_LAYOUT_BVH2;  /* BVH2 is the only layout reported by this device. */
+	}
+
 	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
 	{
@@ -459,4 +463,4 @@ Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool backgrou
 
 CCL_NAMESPACE_END
 
-#endif /* WITH_OPENCL */
+#endif  /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 895e4149a3a..4c9f3cd6ef7 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -106,7 +106,7 @@ cl_context OpenCLCache::get_context(cl_platform_id platform,
 
 	cl_int ciErr = clRetainContext(slot.context);
 	assert(ciErr == CL_SUCCESS);
-	(void)ciErr;
+	(void) ciErr;
 
 	return slot.context;
 }
@@ -153,7 +153,7 @@ cl_program OpenCLCache::get_program(cl_platform_id platform,
 
 	cl_int ciErr = clRetainProgram(entry.program);
 	assert(ciErr == CL_SUCCESS);
-	(void)ciErr;
+	(void) ciErr;
 
 	return entry.program;
 }
@@ -188,7 +188,7 @@ void OpenCLCache::store_context(cl_platform_id platform,
 	 * The caller is going to release the object when done with it. */
 	cl_int ciErr = clRetainContext(context);
 	assert(ciErr == CL_SUCCESS);
-	(void)ciErr;
+	(void) ciErr;
 }
 
 void OpenCLCache::store_program(cl_platform_id platform,
@@ -227,7 +227,7 @@ void OpenCLCache::store_program(cl_platform_id platform,
 	 */
 	cl_int ciErr = clRetainProgram(program);
 	assert(ciErr == CL_SUCCESS);
-	(void)ciErr;
+	(void) ciErr;
 }
 
 string OpenCLCache::get_kernel_md5()
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index 11695a8631d..d50a3786139 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -18,9 +18,9 @@
 
 #include "graph/node_type.h"
 
+#include "util/util_array.h"
 #include "util/util_map.h"
 #include "util/util_param.h"
-#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index 1d565794b27..7d6abae2314 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "graph/node_enum.h"
-
+#include "util/util_array.h"
 #include "util/util_map.h"
 #include "util/util_param.h"
 #include "util/util_string.h"
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index c6e92c6d89d..92cb66bdec9 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -82,6 +82,7 @@ set(SRC_BVH_HEADERS
 	bvh/obvh_traversal.h
 	bvh/obvh_volume.h
 	bvh/obvh_volume_all.h
+	bvh/bvh_embree.h
 )
 
 set(SRC_HEADERS
@@ -96,6 +97,7 @@ set(SRC_HEADERS
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
+	kernel_id_passes.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
@@ -340,11 +342,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90")
+	if(CUDA_VERSION MATCHES "90" OR CUDA_VERSION MATCHES "91")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 8.0 is officially supported")
+			"build may succeed but only CUDA 9.0 and 9.1 are officially supported")
 	endif()
 
 	# build for each arch
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 2ad55d041bf..6708a3efac1 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,10 @@
  * the code has been extended and modified to support more primitives and work
  * with CPU/CUDA/OpenCL. */
 
+#ifdef __EMBREE__
+#  include "kernel/bvh/bvh_embree.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 #include "kernel/bvh/bvh_types.h"
@@ -32,9 +36,9 @@ CCL_NAMESPACE_BEGIN
 /* Common QBVH functions. */
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_nodes.h"
-#ifdef __KERNEL_AVX2__
-#  include "kernel/bvh/obvh_nodes.h"
-#endif
+#  ifdef __KERNEL_AVX2__
+#    include "kernel/bvh/obvh_nodes.h"
+#  endif
 #endif
 
 /* Regular BVH traversal */
@@ -160,6 +164,19 @@ CCL_NAMESPACE_BEGIN
 #undef BVH_NAME_EVAL
 #undef BVH_FUNCTION_FULL_NAME
 
+ccl_device_inline bool scene_intersect_valid(const Ray *ray)
+{
+	/* NOTE: With some of the vectorized traversal code, a non-finite ray
+	 * origin might cause lots of false-positive intersections, which will
+	 * overflow the traversal stack.
+	 * This check is a quick early out that avoids crashes in such cases.
+	 * From production scenes so far it seems to be enough to test the first
+	 * component only, so P.y and P.z are deliberately not checked here.
+	 * Returns true when ray->P.x is finite, false otherwise.
+	 */
+	return isfinite(ray->P.x);
+}
+
 /* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           const Ray ray,
@@ -169,39 +186,57 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           float difl,
                                           float extmax)
 {
+	if(!scene_intersect_valid(&ray)) {
+		return false;
+	}
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {  /* An Embree scene is set: use it instead of the BVH code below. */
+		isect->t = ray.t;
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
+		IntersectContext rtc_ctx(&ctx);
+		RTCRayHit ray_hit;
+		kernel_embree_setup_rayhit(ray, ray_hit, visibility);  /* Also presets geomID/primID to "invalid". */
+		rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
+		if(ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
+			kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
+			return true;
+		}
+		return false;  /* IDs still invalid: nothing was hit. */
+	}
+#endif  /* __EMBREE__ */
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #  ifdef __HAIR__
 		if(kernel_data.bvh.have_curves)
 			return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-#  endif /* __HAIR__ */
+#  endif  /* __HAIR__ */
 
 		return bvh_intersect_motion(kg, &ray, isect, visibility);
 	}
-#endif /* __OBJECT_MOTION__ */
+#endif  /* __OBJECT_MOTION__ */
 
 #ifdef __HAIR__
 	if(kernel_data.bvh.have_curves)
 		return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_CPU__
 
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_instancing(kg, &ray, isect, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
 	return bvh_intersect(kg, &ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
+#else  /* __KERNEL_CPU__ */
 
 #  ifdef __INSTANCING__
 	return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  else
 	return bvh_intersect(kg, &ray, isect, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
-#endif /* __KERNEL_CPU__ */
+#endif  /* __KERNEL_CPU__ */
 }
 
 #ifdef __BVH_LOCAL__
@@ -213,6 +248,58 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
                                                 uint *lcg_state,
                                                 int max_hits)
 {
+	local_isect->num_hits = 0;  /* Clear up front so a rejected ray never leaves a stale hit count. */
+	if(!scene_intersect_valid(&ray)) {
+		return false;
+	}
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS);
+		ctx.lcg_state = lcg_state;
+		ctx.max_hits = max_hits;
+		ctx.ss_isect = local_isect;
+		ctx.sss_object_id = local_object;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+		/* Get the Embree scene for this intersection. */
+		RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
+		if(geom) {
+			float3 P = ray.P;
+			float3 dir = ray.D;
+			float3 idir = ray.D;
+			const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+			if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				Transform ob_itfm;
+				rtc_ray.tfar = bvh_instance_motion_push(kg,
+				                                        local_object,
+				                                        &ray,
+				                                        &P,
+				                                        &dir,
+				                                        &idir,
+				                                        ray.t,
+				                                        &ob_itfm);
+				/* bvh_instance_motion_push() returns the inverse transform but
+				 * it's not needed here. */
+				(void) ob_itfm;
+
+				rtc_ray.org_x = P.x;
+				rtc_ray.org_y = P.y;
+				rtc_ray.org_z = P.z;
+				rtc_ray.dir_x = dir.x;
+				rtc_ray.dir_y = dir.y;
+				rtc_ray.dir_z = dir.z;
+			}
+			RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+			if(scene) {
+				rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+			}
+		}
+
+		return local_isect->num_hits > 0;
+	}
+#endif  /* __EMBREE__ */
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_local_motion(kg,
@@ -222,7 +309,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
 		                                  lcg_state,
 		                                  max_hits);
 	}
-#endif /* __OBJECT_MOTION__ */
+#endif  /* __OBJECT_MOTION__ */
 	return bvh_intersect_local(kg,
 	                            &ray,
 	                            local_isect,
@@ -240,6 +327,27 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
                                                      uint max_hits,
                                                      uint *num_hits)
 {
+	if(!scene_intersect_valid(ray)) {
+		*num_hits = 0;  /* The per-BVH checks this guard replaces ran after *num_hits was cleared. */
+		return false;
+	}
+#  ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
+		ctx.isect_s = isect;
+		ctx.max_hits = max_hits;
+		ctx.num_hits = 0;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW);
+		rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+		if(ctx.num_hits > max_hits) {
+			return true;  /* More hits than the caller allows: report the ray as blocked. */
+		}
+		*num_hits = ctx.num_hits;
+		return rtc_ray.tfar == -INFINITY;  /* Embree marks an occluded ray with tfar = -inf. */
+	}
+#  endif  /* __EMBREE__ */
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
@@ -251,7 +359,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 			                                            max_hits,
 			                                            num_hits);
 		}
-#    endif /* __HAIR__ */
+#    endif  /* __HAIR__ */
 
 		return bvh_intersect_shadow_all_motion(kg,
 		                                       ray,
@@ -260,7 +368,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                       max_hits,
 		                                       num_hits);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
 	if(kernel_data.bvh.have_curves) {
@@ -271,7 +379,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                     max_hits,
 		                                     num_hits);
 	}
-#  endif /* __HAIR__ */
+#  endif  /* __HAIR__ */
 
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing) {
@@ -282,7 +390,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                           max_hits,
 		                                           num_hits);
 	}
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
 	return bvh_intersect_shadow_all(kg,
 	                                ray,
@@ -299,24 +407,27 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
                                                  Intersection *isect,
                                                  const uint visibility)
 {
+	if(!scene_intersect_valid(ray)) {
+		return false;
+	}
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_volume_motion(kg, ray, isect, visibility);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 #  ifdef __KERNEL_CPU__
 #    ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
+#    endif  /* __INSTANCING__ */
 	return bvh_intersect_volume(kg, ray, isect, visibility);
-#  else /* __KERNEL_CPU__ */
+#  else  /* __KERNEL_CPU__ */
 #    ifdef __INSTANCING__
 	return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
 #    else
 	return bvh_intersect_volume(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
-#  endif /* __KERNEL_CPU__ */
+#    endif  /* __INSTANCING__ */
+#  endif  /* __KERNEL_CPU__ */
 }
 #endif  /* __VOLUME__ */
 
@@ -327,15 +438,31 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
                                                      const uint max_hits,
                                                      const uint visibility)
 {
+	if(!scene_intersect_valid(ray)) {
+		return 0;  /* This function returns a hit count, not a bool. */
+	}
+#  ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
+		ctx.isect_s = isect;
+		ctx.max_hits = max_hits;
+		ctx.num_hits = 0;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+		rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+		return ctx.num_hits;  /* NOTE(review): assumes the Embree filter callback counts hits here - confirm. */
+	}
+#  endif  /* __EMBREE__ */
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 	return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
 }
 #endif  /* __VOLUME_RECORD_ALL__ */
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
new file mode 100644
index 00000000000..34a099ebb4d
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2018, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <embree3/rtcore_ray.h>
+#include <embree3/rtcore_scene.h>
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct CCLIntersectContext {
+	/* Kind of ray query this context accompanies. */
+	enum RayType {
+		RAY_REGULAR = 0,
+		RAY_SHADOW_ALL = 1,
+		RAY_SSS = 2,
+		RAY_VOLUME_ALL = 3,
+	};
+
+	KernelGlobals *kg;
+	RayType type;
+
+	/* Hit array and counters for the record-all queries (shadow, volume). */
+	Intersection *isect_s;
+	int max_hits;
+	int num_hits;
+
+	/* State for local (subsurface) queries. */
+	LocalIntersection *ss_isect;
+	int sss_object_id;
+	uint *lcg_state;
+
+	CCLIntersectContext(KernelGlobals *kg_, RayType type_)
+	: kg(kg_),
+	  type(type_),
+	  isect_s(NULL),
+	  max_hits(1),
+	  num_hits(0),
+	  ss_isect(NULL),
+	  sss_object_id(-1),
+	  lcg_state(NULL)
+	{
+	}
+};
+
+class IntersectContext
+{
+public:
+	IntersectContext(CCLIntersectContext* ctx)
+	: userRayExt(ctx)
+	{
+		rtcInitIntersectContext(&context);  /* Let Embree initialize its own part. */
+	}
+	RTCIntersectContext context;      /* NOTE(review): presumably must stay the first member - confirm. */
+	CCLIntersectContext* userRayExt;  /* Cycles-side state for this query. */
+};
+
+ccl_device_inline void kernel_embree_setup_ray(const Ray& ray, RTCRay& rtc_ray, const uint visibility)
+{
+	rtc_ray.org_x = ray.P.x;  /* Origin and direction are copied over unchanged. */
+	rtc_ray.org_y = ray.P.y;
+	rtc_ray.org_z = ray.P.z;
+	rtc_ray.dir_x = ray.D.x;
+	rtc_ray.dir_y = ray.D.y;
+	rtc_ray.dir_z = ray.D.z;
+	rtc_ray.tnear = 0.0f;  /* The query always starts at the ray origin. */
+	rtc_ray.tfar = ray.t;  /* The Cycles ray length becomes Embree's far bound. */
+	rtc_ray.time = ray.time;
+	rtc_ray.mask = visibility;  /* Visibility bits are handed to Embree as the ray mask. */
+}
+
+ccl_device_inline void kernel_embree_setup_rayhit(const Ray& ray, RTCRayHit& rayhit, const uint visibility)
+{
+	kernel_embree_setup_ray(ray, rayhit.ray, visibility);  /* Fill the ray half. */
+	rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;  /* Start the hit half out as "no hit". */
+	rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
+}
+
+ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect)
+{
+	bool is_hair = hit->geomID & 1;  /* Odd geometry IDs are treated as hair. */
+	isect->u = is_hair ? hit->u : 1.0f - hit->v - hit->u;  /* Non-hair hits get their u/v remapped. */
+	isect->v = is_hair ? hit->v : hit->u;
+	isect->t = ray->tfar;  /* NOTE(review): relies on Embree having shortened tfar to the hit distance - confirm. */
+	isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+	if(hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {  /* Hit inside an instance. */
+		RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
+		isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, hit->instID[0]/2);
+		isect->object = hit->instID[0]/2;  /* The instance geometry ID halved is used as the object index. */
+	}
+	else {  /* Hit on geometry looked up directly in the top-level scene. */
+		isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
+		isect->object = OBJECT_NONE;
+	}
+	isect->type = kernel_tex_fetch(__prim_type, isect->prim);  /* Primitive type is looked up from the final prim index. */
+}
+
+ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int local_object_id)
+{
+	isect->u = 1.0f - hit->v - hit->u;  /* Same u/v remapping as the non-hair case of kernel_embree_convert_hit(). */
+	isect->v = hit->u;
+	isect->t = ray->tfar;  /* NOTE(review): relies on tfar holding the hit distance at this point - confirm. */
+	isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+	RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, local_object_id * 2));
+	isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, local_object_id);
+	isect->object = local_object_id;  /* The object is the one passed in, not derived from instID. */
+	isect->type = kernel_tex_fetch(__prim_type, isect->prim);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 2b02f4527bb..8364bc3aa9a 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -136,7 +136,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               PATH_RAY_ALL_VISIBILITY,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -151,7 +151,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               PATH_RAY_ALL_VISIBILITY,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index d525b29fd94..64eb2f3f659 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -124,7 +124,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -139,7 +139,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index e95d2408201..af9f04db0ba 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -146,7 +146,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					                               visibility,
 					                               dist);
 				}
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 #  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					traverse_mask = NODE_INTERSECT_ROBUST(kg,
@@ -184,7 +184,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					                               visibility,
 					                               dist);
 				}
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 7d03855cb8f..12d4c5eb94a 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -120,7 +120,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -135,7 +135,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 3d9b598914f..6205b9bcf7a 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -124,7 +124,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -139,7 +139,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
index 92143193a6a..eb24a607caa 100644
--- a/intern/cycles/kernel/bvh/obvh_local.h
+++ b/intern/cycles/kernel/bvh/obvh_local.h
@@ -73,12 +73,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 		object = local_object;
 	}
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	avxf tnear(0.0f), tfar(isect_t);
 #if BVH_FEATURE(BVH_HAIR)
 	avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
index 3e877065127..8b739b3438a 100644
--- a/intern/cycles/kernel/bvh/obvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h
@@ -66,12 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	*num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
@@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
index 2021d8e1143..6bb19eb1ed9 100644
--- a/intern/cycles/kernel/bvh/obvh_traversal.h
+++ b/intern/cycles/kernel/bvh/obvh_traversal.h
@@ -64,12 +64,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
@@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -179,7 +173,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 					avxf cnodes;
 					/* TODO(sergey): Investigate whether moving cnodes upwards
 					 * gives a speedup (will be different cache pattern but will
-					 * avoid extra check here),
+					 * avoid extra check here).
 					 */
 #if BVH_FEATURE(BVH_HAIR)
 					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
index da9ddbd4f24..80d09c59039 100644
--- a/intern/cycles/kernel/bvh/obvh_volume.h
+++ b/intern/cycles/kernel/bvh/obvh_volume.h
@@ -52,12 +52,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
index a88573e6f86..87216127ddb 100644
--- a/intern/cycles/kernel/bvh/obvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/obvh_volume_all.h
@@ -58,12 +58,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	uint num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return 0;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h
index ee3827de309..22d434a8737 100644
--- a/intern/cycles/kernel/bvh/qbvh_local.h
+++ b/intern/cycles/kernel/bvh/qbvh_local.h
@@ -82,12 +82,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		object = local_object;
 	}
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	ssef tnear(0.0f), tfar(isect_t);
 #if BVH_FEATURE(BVH_HAIR)
 	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 46fd178aed6..37606e10b92 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -66,11 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	*num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
 
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
@@ -103,7 +98,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 335a4afd47a..35c6e3aeec9 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -71,12 +71,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
@@ -112,7 +106,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -188,7 +182,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					float4 cnodes;
 					/* TODO(sergey): Investigate whether moving cnodes upwards
 					 * gives a speedup (will be different cache pattern but will
-					 * avoid extra check here),
+					 * avoid extra check here).
 					 */
 #if BVH_FEATURE(BVH_HAIR)
 					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index 192ce009524..7ec264e5f78 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -58,12 +58,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index 1e454e4d36b..dd603d79334 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -64,12 +64,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	uint num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return 0;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index ff238b7a834..4e7425bd800 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -232,4 +232,4 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
+#endif  /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index b0bdea723b9..80fd9ba2b37 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -158,4 +158,4 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
+#endif  /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ee6d4cdf2df..946c460a70e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -139,4 +139,4 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_DIFFUSE_H__ */
+#endif  /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 35bb2fdf0e8..ca33a5b275c 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -103,8 +103,8 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, float3 Ng, floa
 	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
-#endif /* __OSL__ */
+#endif  /* __OSL__ */
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
+#endif  /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7b44a23f05b..e1a0cfaa3f5 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -277,4 +277,4 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_HAIR_H__ */
+#endif  /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index b3b56be39ff..68335ee887a 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -229,7 +229,7 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
-#endif /* __HAIR__ */
+#endif  /* __HAIR__ */
 
 /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
 ccl_device_inline void hair_attenuation(KernelGlobals *kg,
@@ -296,7 +296,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
 	float3 Y = float4_to_float3(bsdf->extra->geom);
 
 	float3 X = safe_normalize(sd->dPdu);
-	kernel_assert(fabsf(dot(X, Y)) < 1e-4f);
+	kernel_assert(fabsf(dot(X, Y)) < 1e-3f);
 	float3 Z = safe_normalize(cross(X, Y));
 
 	float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z));
@@ -378,7 +378,7 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
 	float3 Y = float4_to_float3(bsdf->extra->geom);
 
 	float3 X = safe_normalize(sd->dPdu);
-	kernel_assert(fabsf(dot(X, Y)) < 1e-4f);
+	kernel_assert(fabsf(dot(X, Y)) < 1e-3f);
 	float3 Z = safe_normalize(cross(X, Y));
 
 	float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z));
@@ -499,4 +499,4 @@ ccl_device void bsdf_principled_hair_blur(ShaderClosure *sc, float roughness)
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
+#endif  /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index e74d5ebaa42..32b6e50b09a 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -1124,4 +1124,4 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_MICROFACET_H__ */
+#endif  /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index e73915dbda7..5d300ef6db5 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -76,7 +76,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 		eval *= -lambda_r / (shadowing_lambda - lambda_r);
 	else
 		eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 	const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
 	float val = G2 * 0.25f / wi.z;
 	if(alpha.x == alpha.y)
@@ -129,7 +129,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 				phase = mf_eval_phase_glass(wr, lambda_r,  wo,  wo_outside, alpha, eta);
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
 #endif
 			eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
@@ -153,7 +153,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			else if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
 			}
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 			if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 			}
@@ -248,7 +248,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
 					throughput *= t_color;
 			}
 		}
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 		if(use_fresnel) {
 			float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 6b770fc0c16..3446d1609d9 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -108,4 +108,4 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_OREN_NAYAR_H__ */
+#endif  /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 91c7803346d..83da05ac435 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -135,8 +135,8 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, float3 Ng, float3
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
-#endif /* __OSL__ */
+#endif  /* __OSL__ */
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PHONG_RAMP_H__ */
+#endif  /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 83be2b35a00..2f65fd54be2 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -122,4 +122,4 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+#endif  /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 8b7c4399516..ccdcb1babd2 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -108,4 +108,4 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+#endif  /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index b33b6e3597b..94f1c283af7 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -77,4 +77,4 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_REFLECTION_H__ */
+#endif  /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index b181650e928..abdd01c7a1d 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -86,4 +86,4 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_REFRACTION_H__ */
+#endif  /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 6d8074b7130..097a56f22eb 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -215,4 +215,4 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_TOON_H__ */
+#endif  /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index f788dbcd0ff..060dff69f52 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -106,4 +106,4 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_TRANSPARENT_H__ */
+#endif  /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b080e025d16..4f3453675c7 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -158,4 +158,4 @@ ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, floa
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_UTIL_H__ */
+#endif  /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index ba0c6ae8c61..98c7f23c288 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -499,4 +499,4 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_BSSRDF_H__ */
+#endif  /* __KERNEL_BSSRDF_H__ */
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
index f6e474d6702..4209d69ee73 100644
--- a/intern/cycles/kernel/filter/filter.h
+++ b/intern/cycles/kernel/filter/filter.h
@@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#endif /* __FILTER_H__ */
+#endif  /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 1a2f22a6987..67f4e62ac0f 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -68,4 +68,4 @@ typedef struct TileInfo {
 #  define ccl_get_tile_buffer(id) (tile_info->buffers[id])
 #endif
 
-#endif /* __FILTER_DEFINES_H__*/
+#endif  /* __FILTER_DEFINES_H__ */
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index e2da0fd872b..af73c0dadf2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,6 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))  /* float4 lvalue at buf+ofs, accessed in place; used where the offset is 16-byte aligned */
+#define load4_u(buf, ofs) load_float4((buf)+(ofs))  /* float4 value read through load_float4(); used where the offset may be unaligned */
+
 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
+	/* Strides need to be aligned to 16 bytes. */
+	kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+	int aligned_lowx = rect.x & (~3);
+	const int numChannels = (channel_offset > 0)? 3 : 1;
+	const float4 channel_fac = make_float4(1.0f / numChannels);
+
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			float diff = 0.0f;
-			int numChannels = channel_offset? 3 : 1;
-			for(int c = 0; c < numChannels; c++) {
-				float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-				float pvar = variance_image[c*channel_offset + y*stride + x];
-				float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
-			}
-			if(numChannels > 1) {
-				diff *= 1.0f/numChannels;
+		int idx_p = y*stride + aligned_lowx;
+		int idx_q = (y+dy)*stride + aligned_lowx + dx;
+		for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+			float4 diff = make_float4(0.0f);
+			for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+				/* idx_p is guaranteed to be aligned, but idx_q isn't. */
+				float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+				float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+				float4 cdiff = color_p - color_q;
+				float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+				float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+				diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
 			}
-			difference_image[y*stride + x] = diff;
+			load4_a(difference_image, idx_p) = diff*channel_fac;
 		}
 	}
 }
@@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int stride,
                                               int f)
 {
-	int aligned_lowx = rect.x / 4;
-	int aligned_highx = (rect.z + 3) / 4;
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
 		const int low = max(rect.y, y-f);
 		const int high = min(rect.w, y+f+1);
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] = 0.0f;
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = make_float4(0.0f);
 		}
 		for(int y1 = low; y1 < high; y1++) {
-			float4* out_image4 = (float4*)(out_image + y*stride);
-			float4* difference_image4 = (float4*)(difference_image + y1*stride);
-			for(int x = aligned_lowx; x < aligned_highx; x++) {
-				out_image4[x] += difference_image4[x];
+			for(int x = aligned_lowx; x < rect.z; x += 4) {
+				load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
 			}
 		}
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] *= 1.0f/(high - low);
+		float fac = 1.0f/(high - low);
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) *= fac;
 		}
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                                     float *out_image,
-                                                     int4 rect,
-                                                     int stride,
-                                                     int f)
+ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
+                                           float *out_image,
+                                           int4 rect,
+                                           int stride,
+                                           int f)
 {
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] = 0.0f;
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = make_float4(0.0f);
 		}
 	}
+
 	for(int dx = -f; dx <= f; dx++) {
-		int pos_dx = max(0, dx);
-		int neg_dx = min(0, dx);
+		aligned_lowx = round_down(rect.x - min(0, dx), 4);
+		int highx = rect.z - max(0, dx);
+		int4 lowx4 = make_int4(rect.x - min(0, dx));
+		int4 highx4 = make_int4(rect.z - max(0, dx));
 		for(int y = rect.y; y < rect.w; y++) {
-			for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-				out_image[y*stride + x] += difference_image[y*stride + x+dx];
+			for(int x = aligned_lowx; x < highx; x += 4) {
+				int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+				int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+				float4 diff = load4_u(difference_image, y*stride + x + dx);
+				load4_a(out_image, y*stride + x) += mask(active, diff);
 			}
 		}
 	}
+
+	aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+			float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+			float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
+			load4_a(out_image, y*stride + x) *= rcp(high - low);
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image,
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
+{
+	nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+	int aligned_lowx = round_down(rect.x, 4);
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
 		}
 	}
 }
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
+                                                       float *temp_image,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
                                                        int stride,
                                                        int f)
 {
+	nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			float sum = 0.0f;
-			for(int x1 = low; x1 < high; x1++) {
-				sum += difference_image[y*stride + x1];
-			}
-			float weight = sum * (1.0f/(high - low));
-			accum_image[y*stride + x] += weight;
-			out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+			int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+
+			int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+
+			float4 weight = load4_a(temp_image, idx_p);
+			load4_a(accum_image, idx_p) += mask(active, weight);
+
+			float4 val = load4_u(image, idx_q);
+
+			load4_a(out_image, idx_p) += mask(active, weight*val);
 		}
 	}
 }
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
 	}
 }
 
+#undef load4_a
+#undef load4_u
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 4cfbe21685c..b6b58b52a29 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -379,7 +379,7 @@ ccl_device_forceinline bool cardinal_curve_intersect(
 					float inv_mw_extension = 1.0f/mw_extension;
 					if(d0 >= 0)
 						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
+					else  // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
 
@@ -817,16 +817,24 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
 			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
-			sd->Ng = normalize(P - p_curr);
+#ifdef __EMBREE__
+			if(kernel_data.bvh.scene) {
+				sd->Ng = normalize(isect->Ng);  /* normal taken from the intersection record */
+			}
+			else
+#endif  /* __EMBREE__ */
+			{
+				/* direction from inside to surface of curve */
+				float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+				sd->Ng = normalize(P - p_curr);
 
-			/* adjustment for changing radius */
-			float gd = isect->v;
+				/* adjustment for changing radius */
+				float gd = isect->v;
 
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				if(gd != 0.0f) {
+					sd->Ng = sd->Ng - gd * tg;
+					sd->Ng = normalize(sd->Ng);
+				}
 			}
 		}
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index cfe17e63627..669c932d720 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -78,6 +78,12 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int
 	const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
 
 	Transform tfm;
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		transform_motion_array_interpolate_straight(&tfm, motion, num_steps, time);
+	}
+	else
+#endif
 	transform_motion_array_interpolate(&tfm, motion, num_steps, time);
 
 	return tfm;
@@ -304,6 +310,24 @@ ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 	return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
 }
 
+/* Cryptomatte ID */
+
+ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+{
+	/* No object means no entry to look up: report 0.0f. Otherwise return
+	 * the cryptomatte_object field stored for this object in __objects. */
+	return (object == OBJECT_NONE)? 0.0f
+	                              : kernel_tex_fetch(__objects, object).cryptomatte_object;
+}
+
+ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+{
+	/* No object: return 0.0f, the same float literal object_cryptomatte_id() uses. */
+	if(object == OBJECT_NONE)
+		return 0.0f;
+	return kernel_tex_fetch(__objects, object).cryptomatte_asset;
+}
+
 /* Particle data from which object was instanced */
 
 ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 00ce89ae567..8c0d0a9770e 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -146,7 +146,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		return a;
 	}
 	else
-#endif /* __PATCH_EVAL__ */
+#endif  /* __PATCH_EVAL__ */
 	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
@@ -271,7 +271,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		return a;
 	}
 	else
-#endif /* __PATCH_EVAL__ */
+#endif  /* __PATCH_EVAL__ */
 	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index aa6b102a0f3..57f4c86d403 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -71,28 +71,23 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 }
 
 #ifdef __KERNEL_AVX2__
-
 #define	cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
-#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
-ccl_device_inline
-#else
-ccl_device_forceinline
-#endif
-int ray_triangle_intersect8(KernelGlobals *kg,
-                            float3 ray_P,
-                            float3 ray_dir,
-                            Intersection **isect,
-                            uint visibility,
-                            int object,
-                            __m256 *triA,
-                            __m256 *triB,
-                            __m256 *triC,
-                            int prim_addr,
-                            int prim_num,
-                            uint *num_hits,
-                            uint max_hits,
-                            int *num_hits_in_instance,
-                            float isec_t)
+ccl_device_inline int ray_triangle_intersect8(
+            KernelGlobals *kg,
+            float3 ray_P,
+            float3 ray_dir,
+            Intersection **isect,
+            uint visibility,
+            int object,
+            __m256 *triA,
+            __m256 *triB,
+            __m256 *triC,
+            int prim_addr,
+            int prim_num,
+            uint *num_hits,
+            uint max_hits,
+            int *num_hits_in_instance,
+            float isect_t)
 {
 
 	const unsigned char prim_num_mask = (1 << prim_num) - 1;
@@ -108,10 +103,6 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
 
 	/* Calculate vertices relative to ray origin. */
-	/*	const float3 v0 = tri_c - P;
-	const float3 v1 = tri_a - P;
-	const float3 v2 = tri_b - P; */
-
 	__m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
 	__m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
 	__m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
@@ -136,11 +127,7 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	__m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
 	__m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
 
-	/* Calculate triangle edges.
-	const float3 e0 = v2 - v0;
-	const float3 e1 = v0 - v1;
-	const float3 e2 = v1 - v2;*/
-
+	/* Calculate triangle edges. */
 	__m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
 	__m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
 	__m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
@@ -153,48 +140,32 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	__m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
 	__m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
 
-	/* Perform edge tests.
-	const float U = dot(cross(v2 + v0, e0), ray_dir);
-	const float V = dot(cross(v0 + v1, e1), ray_dir);
-	const float W = dot(cross(v1 + v2, e2), ray_dir);*/
-
-	//cross (AyBz - AzBy, AzBx -AxBz,  AxBy - AyBx)
+	/* Perform edge tests. */
+	/* cross(A, B) = (AyBz - AzBy, AzBx - AxBz, AxBy - AyBx) */
 	__m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
 	__m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
 	__m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
-	U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256));
-	U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256));
+	U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256);
+	U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256);
 
 	__m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
 	__m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
 	__m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
-	V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256));
-	V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256));
+	V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);
+	V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);
 
 	__m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
 	__m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
 	__m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
-	W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256));
-	W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256));
-
-	//const float minUVW = min(U, min(V, W));
-	//const float maxUVW = max(U, max(V, W));
-#if 0
-	__m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
-	__m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
-
-	//if(minUVW < 0.0f && maxUVW > 0.0f)
-	__m256i mask_minmaxUVW_256 = _mm256_and_si256(
-		_mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
-		//_mm256_castps_si256(minUVW_256),
-		_mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
-#else
+	W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);
+	W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);
+
 	__m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
 	__m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
 	__m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
@@ -204,9 +175,8 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	const __m256i two256 = _mm256_set1_epi32(2);
 
 	__m256i mask_minmaxUVW_256 = _mm256_or_si256(
-		_mm256_cmpeq_epi32(one256, UVW_256_1),
-		_mm256_cmpeq_epi32(two256, UVW_256_1) );
-#endif
+	        _mm256_cmpeq_epi32(one256, UVW_256_1),
+	        _mm256_cmpeq_epi32(two256, UVW_256_1));
 
 	unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
 	if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
@@ -214,231 +184,187 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	}
 
 	/* Calculate geometry normal and denominator. */
-	//			const float3 Ng1 = cross(e1, e0);
-	//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
-
 	__m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
 	__m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
 	__m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
 
-	//const float3 Ng = Ng1 + Ng1;
 	Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
 	Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
 	Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
 
-	//const float den = dot3(Ng, dir);
-	//vertical dot
+	/* vertical dot */
 	__m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
-	den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256));
-	den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256));
-
-	// __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+	den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);
+	den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);
 
 	/* Perform depth test. */
-	//const float T = dot3(v0, Ng);
 	__m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
-	T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256));
-	T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256));
+	T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);
+	T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);
 
-	//const int sign_den = (__float_as_int(den) & 0x80000000);
 	const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
 	__m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
 
-	//const float sign_T = xor_signmask(T, sign_den);
 	__m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
 
-	/*if((sign_T < 0.0f) || mask_minmaxUVW_pos {	return false;}	*/
 	unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
 	if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
 		return false;
-	} /**/
+	} 
 
 	__m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
 
-
 	ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
 	ccl_align(32) unsigned int mask_minmaxUVW8[8];
 
-	if(visibility == PATH_RAY_SHADOW_OPAQUE){
-			__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256
-
-			__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
-
-			__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
-			__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
-
-			__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
-				_mm256_castps_si256(
-					_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
-				)
-			);
-			/*	__m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256,
-			_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256),
-			_CMP_GT_OS
-			) );*/
-
-			mask0 = _mm256_or_si256(mask1, mask0);
-			//unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0));
-			//unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256));
-			//unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos));
-			mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
-			mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
-
-			unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
-			if((mask_final & prim_num_mask) == 0) { //all bits NOT set
-				return false;
-			}		/**/
-
-			unsigned long i = 0;
-#if defined(_MSC_VER)
-			unsigned char res = _BitScanForward(&i, (unsigned long)mask_final);
-#else
-            i = __builtin_ffs(mask_final)-1;
-#endif
-
-			den_256 = _mm256_rcp_ps(den_256); //inv_den
-			U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den
-			V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den
-			T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den
-
-			_mm256_store_ps(U8, U_256);
-			_mm256_store_ps(V8, V_256);
-			_mm256_store_ps(T8, T_256);
-
-
-			//here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true
-
-			(*isect)->u = U8[i];
-			(*isect)->v = V8[i];
-			(*isect)->t = T8[i];
-
-			(*isect)->prim = (prim_addr + i);
-			(*isect)->object = object;
-			(*isect)->type = PRIMITIVE_TRIANGLE;
-
-			return true;
+	if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+		__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);
+		__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+		__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
+		__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
+		__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
+			_mm256_castps_si256(
+				_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
+			)
+		);
+		mask0 = _mm256_or_si256(mask1, mask0);
+		mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
+		mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
+		unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
+		if((mask_final & prim_num_mask) == 0) {
+			return false;
 		}
+		const int i = __bsf(mask_final);
+		__m256 inv_den_256 = _mm256_rcp_ps(den_256);
+		U_256 = _mm256_mul_ps(U_256, inv_den_256);
+		V_256 = _mm256_mul_ps(V_256, inv_den_256);
+		T_256 = _mm256_mul_ps(T_256, inv_den_256);
+		_mm256_store_ps(U8, U_256);
+		_mm256_store_ps(V8, V_256);
+		_mm256_store_ps(T8, T_256);
+		/* NOTE: Here we assume visibility for all triangles in the node is
+		 * the same. */
+		(*isect)->u = U8[i];
+		(*isect)->v = V8[i];
+		(*isect)->t = T8[i];
+		(*isect)->prim = (prim_addr + i);
+		(*isect)->object = object;
+		(*isect)->type = PRIMITIVE_TRIANGLE;
+		return true;
+	}
 	else {
-			_mm256_store_ps(den8, den_256);
-			_mm256_store_ps(U8, U_256);
-			_mm256_store_ps(V8, V_256);
-			_mm256_store_ps(T8, T_256);
+		_mm256_store_ps(den8, den_256);
+		_mm256_store_ps(U8, U_256);
+		_mm256_store_ps(V8, V_256);
+		_mm256_store_ps(T8, T_256);
 
-			_mm256_store_ps(sign_T8, sign_T_256);
-			_mm256_store_ps(xor_signmask8, xor_signmask_256);
-			_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
+		_mm256_store_ps(sign_T8, sign_T_256);
+		_mm256_store_ps(xor_signmask8, xor_signmask_256);
+		_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
 
-			int ret = false;
+		int ret = false;
 
-			if(visibility == PATH_RAY_SHADOW) {
-				for(int i = 0; i < prim_num; i++) {
-					if(!mask_minmaxUVW8[i]) {
+		if(visibility == PATH_RAY_SHADOW) {
+			for(int i = 0; i < prim_num; i++) {
+				if(mask_minmaxUVW8[i]) {
+					continue;
+				}
 #ifdef __VISIBILITY_FLAG__
-						if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+				if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
+					continue;
+				}
 #endif
-						{
-							if((sign_T8[i] >= 0.0f) &&
-							   (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
-							{
-								if(den8[i]) {
-									const float inv_den = 1.0f / den8[i];
-
-									(*isect)->u = U8[i] * inv_den;
-									(*isect)->v = V8[i] * inv_den;
-									(*isect)->t = T8[i] * inv_den;
-
-									(*isect)->prim = (prim_addr + i);
-									(*isect)->object = object;
-									(*isect)->type = PRIMITIVE_TRIANGLE;
-
-									int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
-									int shader = 0;
-
+				if((sign_T8[i] < 0.0f) ||
+				   (sign_T8[i] > (*isect)->t * xor_signmask8[i]))
+				{
+					continue;
+				}
+				if(!den8[i]) {
+					continue;
+				}
+				const float inv_den = 1.0f / den8[i];
+				(*isect)->u = U8[i] * inv_den;
+				(*isect)->v = V8[i] * inv_den;
+				(*isect)->t = T8[i] * inv_den;
+				(*isect)->prim = (prim_addr + i);
+				(*isect)->object = object;
+				(*isect)->type = PRIMITIVE_TRIANGLE;
+				const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
+				int shader = 0;
 #ifdef __HAIR__
-									if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
+				if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
-									{
-										shader = kernel_tex_fetch(__tri_shader, prim);
-									}
+				{
+					shader = kernel_tex_fetch(__tri_shader, prim);
+				}
 #ifdef __HAIR__
-									else {
-										float4 str = kernel_tex_fetch(__curves, prim);
-										shader = __float_as_int(str.z);
-									}
+				else {
+					float4 str = kernel_tex_fetch(__curves, prim);
+					shader = __float_as_int(str.z);
+				}
 #endif
-									int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
-									/* if no transparent shadows, all light is blocked */
-									if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-										return 2;
-									}
-									/* if maximum number of hits reached, block all light */
-									else if(*num_hits == max_hits) {
-										return 2;
-									}
-									/* move on to next entry in intersections array */
-									ret = true;
-
-									(*isect)++;
-									(*num_hits)++;
-
-									(*num_hits_in_instance)++;
-
-									(*isect)->t = isec_t;
-
-								} //den
-							} //if sign
-						} //vis
-					}//if mask
-				} //for
+				const int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+				/* If no transparent shadows, all light is blocked. */
+				if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+					return 2;
+				}
+				/* If maximum number of hits reached, block all light. */
+				else if(num_hits == NULL || *num_hits == max_hits) {
+					return 2;
+				}
+				/* Move on to next entry in intersections array. */
+				ret = true;
+				(*isect)++;
+				(*num_hits)++;
+				(*num_hits_in_instance)++;
+				(*isect)->t = isect_t;
+			}
 		}
-		else { //default case
+		else {
 			for(int i = 0; i < prim_num; i++) {
-				if(!mask_minmaxUVW8[i]) {
+				if(mask_minmaxUVW8[i]) {
+					continue;
+				}
 #ifdef __VISIBILITY_FLAG__
-					if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+				if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
+					continue;
+				}
 #endif
-					{
-						if((sign_T8[i] >= 0.0f) &&
-						   (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
-						{
-							if(den8[i]) {
-								const float inv_den = 1.0f / den8[i];
-
-								(*isect)->u = U8[i] * inv_den;
-								(*isect)->v = V8[i] * inv_den;
-								(*isect)->t = T8[i] * inv_den;
-
-								(*isect)->prim = (prim_addr + i);
-								(*isect)->object = object;
-								(*isect)->type = PRIMITIVE_TRIANGLE;
-
-								ret = true;
-							} //den
-						} //if sign
-					} //vis
-				}//if mask
-			} //for
-		} //default
-	return ret;
-}// else PATH_RAY_SHADOW_OPAQUE
-
+				if((sign_T8[i] < 0.0f) ||
+				   (sign_T8[i] > (*isect)->t * xor_signmask8[i]))
+				{
+					continue;
+				}
+				if(!den8[i]) {
+					continue;
+				}
+				const float inv_den = 1.0f / den8[i];
+				(*isect)->u = U8[i] * inv_den;
+				(*isect)->v = V8[i] * inv_den;
+				(*isect)->t = T8[i] * inv_den;
+				(*isect)->prim = (prim_addr + i);
+				(*isect)->object = object;
+				(*isect)->type = PRIMITIVE_TRIANGLE;
+				ret = true;
+			}
+		}
+		return ret;
+	}
 }
 
-//vz static
-ccl_device_inline
-int triangle_intersect8(KernelGlobals *kg,
-                        Intersection **isect,
-                        float3 P,
-                        float3 dir,
-                        uint visibility,
-                        int object,
-                        int prim_addr,
-                        int prim_num,
-                        uint *num_hits,
-                        uint max_hits,
-                        int *num_hits_in_instance,
-                        float isec_t)
+ccl_device_inline int triangle_intersect8(
+        KernelGlobals *kg,
+        Intersection **isect,
+        float3 P,
+        float3 dir,
+        uint visibility,
+        int object,
+        int prim_addr,
+        int prim_num,
+        uint *num_hits,
+        uint max_hits,
+        int *num_hits_in_instance,
+        float isect_t)
  {
 	__m128 tri_a[8], tri_b[8], tri_c[8];
 	__m256  tritmp[12], tri[12];
@@ -540,11 +466,11 @@ int triangle_intersect8(KernelGlobals *kg,
 	                                     num_hits,
 	                                     max_hits,
 	                                     num_hits_in_instance,
-	                                     isec_t);
+	                                     isect_t);
 	return result;
 }
 
-#endif /* __KERNEL_AVX2__ */
+#endif  /* __KERNEL_AVX2__ */
 
 /* Special ray intersection routines for subsurface scattering. In that case we
  * only want to intersect with primitives in the same object, and if case of
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 373324afb01..1c8c91d15e6 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -63,4 +63,4 @@ void kernel_tex_copy(KernelGlobals *kg,
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_H__ */
+#endif  /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 990e798543a..ea478a8a5d3 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -35,4 +35,4 @@ ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_COLOR_H__ */
+#endif  /* __KERNEL_COLOR_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index aa7a16afa1d..4ee80850402 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -153,4 +153,4 @@ typedef vector3<avxf> avx3f;
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_COMPAT_CPU_H__ */
+#endif  /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index ac63bcf7ac9..8ed96bbae64 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -150,4 +150,4 @@ ccl_device_inline uint ccl_num_groups(uint d)
 #define logf(x) __logf(((float)(x)))
 #define expf(x) __expf(((float)(x)))
 
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+#endif  /* __KERNEL_COMPAT_CUDA_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 3f7e264fbee..21a95098894 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -159,4 +159,4 @@
 #include "util/util_half.h"
 #include "util/util_types.h"
 
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
+#endif  /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 74cfacb5bc1..37402f42863 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -21,6 +21,7 @@
 
 #ifdef __KERNEL_CPU__
 #  include "util/util_vector.h"
+#  include "util/util_map.h"
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -42,6 +43,8 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif
 
+typedef unordered_map<float, float> CoverageMap;
+
 struct Intersection;
 struct VolumeStep;
 
@@ -68,6 +71,11 @@ typedef struct KernelGlobals {
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
 
+	/* A buffer for storing per-pixel coverage for Cryptomatte. */
+	CoverageMap *coverage_object;
+	CoverageMap *coverage_material;
+	CoverageMap *coverage_asset;
+
 	/* split kernel */
 	SplitData split_data;
 	SplitParams split_param_data;
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
new file mode 100644
index 00000000000..ee3b8b8abfb
--- /dev/null
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -0,0 +1,94 @@
+/*
+* Copyright 2018 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, float weight)
+{
+	kernel_assert(id != ID_NONE);
+	if(weight == 0.0f) {
+		return;
+	}
+	/* The buffer is read as num_slots (id, weight) float2 pairs: add the weight to the slot holding this ID, claiming the first empty slot if needed. */
+	for(int slot = 0; slot < num_slots; slot++) {
+		ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+#ifdef __ATOMIC_PASS_WRITE__
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			/* Use an atomic to claim this slot.
+			 * If the swap reports neither ID_NONE nor this ID (presumably the slot's previous content), skip to the next slot. */
+			float old_id = atomic_compare_and_swap_float(buffer+slot*2, ID_NONE, id);
+			if(old_id != ID_NONE && old_id != id) {
+				continue;
+			}
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		 * If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+#else  /* __ATOMIC_PASS_WRITE__ */
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			id_buffer[slot].x = id;
+			id_buffer[slot].y = weight;
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		 * If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			id_buffer[slot].y += weight;
+			break;
+		}
+#endif  /* __ATOMIC_PASS_WRITE__ */
+	}
+}
+
+ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
+{
+	ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+	for(int slot = 1; slot < num_slots; ++slot) {
+		if(id_buffer[slot].x == ID_NONE) {
+			return;  /* Stop at the first empty slot. */
+		}
+		/* Insertion sort by descending weight (.y); with this tiny number of elements that should be fine. */
+		int i = slot;
+		while(i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
+			float2 swap = id_buffer[i];
+			id_buffer[i] = id_buffer[i - 1];
+			id_buffer[i - 1] = swap;
+			--i;
+		}
+	}
+}
+
+#ifdef __KERNEL_GPU__
+/* Post-sorting for Cryptomatte: once sample - 1 equals aa_samples, sort the ID slots at (x, y) with kernel_sort_id_slots(). */
+ccl_device void kernel_cryptomatte_post(KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+{
+	if(sample - 1 == kernel_data.integrator.aa_samples) {
+		int index = offset + x + y * stride;
+		int pass_stride = kernel_data.film.pass_stride;
+		ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + kernel_data.film.pass_cryptomatte;
+		kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+	}
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..a8a43f3ea4a 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -25,4 +25,4 @@
 #include "util/util_texture.h"
 #include "util/util_transform.h"
 
-#endif /* __KERNEL_MATH_H__ */
+#endif  /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index 9b96bb80c32..dde93844dd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -187,7 +187,10 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u,
 ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
 {
 	float3 R = 2*dot(N, I)*N - I;
-	if(dot(Ng, R) >= 0.05f) {
+
+	/* Reflection rays may always be at least as shallow as the incoming ray. */
+	float threshold = min(0.9f*dot(Ng, I), 0.01f);
+	if(dot(Ng, R) >= threshold) {
 		return N;
 	}
 
@@ -195,24 +198,88 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
 	 * The X axis is found by normalizing the component of N that's orthogonal to Ng.
 	 * The Y axis isn't actually needed.
 	 */
-	float3 X = normalize(N - dot(N, Ng)*Ng);
-
-	/* Calculate N.z and N.x in the local coordinate system. */
-	float Iz = dot(I, Ng);
-	float Ix2 = sqr(dot(I, X)), Iz2 = sqr(Iz);
-	float Ix2Iz2 = Ix2 + Iz2;
-
-	float a = safe_sqrtf(Ix2*(Ix2Iz2 - sqr(0.05f)));
-	float b = Iz*0.05f + Ix2Iz2;
-	float c = (a + b > 0.0f)? (a + b) : (-a + b);
+	float NdotNg = dot(N, Ng);
+	float3 X = normalize(N - NdotNg*Ng);
+
+	/* Calculate N.z and N.x in the local coordinate system.
+	 *
+	 * The goal of this computation is to find a N' that is rotated towards Ng just enough
+	 * to lift R' above the threshold (here called t), therefore dot(R', Ng) = t.
+	 *
+	 * According to the standard reflection equation, this means that we want dot(2*dot(N', I)*N' - I, Ng) = t.
+	 *
+	 * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get 2*dot(N', I)*N'.z - I.z = t.
+	 *
+	 * The rotation is simple to express in the coordinate system we formed - since N lies in the X-Z-plane, we know that
+	 * N' will also lie in the X-Z-plane, so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z .
+	 *
+	 * Furthermore, we want N' to be normalized, so N'.x = sqrt(1 - N'.z^2).
+	 *
+	 * With these simplifications, we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t.
+	 *
+	 * The only unknown here is N'.z, so we can solve for that.
+	 *
+	 * The equation has four solutions in general:
+	 *
+	 * N'.z = +-sqrt(0.5*(+-sqrt(I.x^2*(I.x^2 + I.z^2 - t^2)) + t*I.z + I.x^2 + I.z^2)/(I.x^2 + I.z^2))
+	 * We can simplify this expression a bit by grouping terms:
+	 *
+	 * a = I.x^2 + I.z^2
+	 * b = sqrt(I.x^2 * (a - t^2))
+	 * c = I.z*t + a
+	 * N'.z = +-sqrt(0.5*(+-b + c)/a)
+	 *
+	 * Two solutions can immediately be discarded because they're negative so N' would lie in the lower hemisphere.
+	 */
+	float Ix = dot(I, X), Iz = dot(I, Ng);
+	float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+	float a = Ix2 + Iz2;
+
+	float b = safe_sqrtf(Ix2*(a - sqr(threshold)));
+	float c = Iz*threshold + a;
+
+	/* Evaluate both solutions.
+	 * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than one), so check for that first.
+	 * If no option is viable (might happen in extreme cases like N being in the wrong hemisphere), give up and return Ng. */
+	float fac = 0.5f/a;
+	float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c);
+	bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f));
+	bool valid2 = (N2_z2 > 1e-5f) && (N2_z2 <= (1.0f + 1e-5f));
+
+	float2 N_new;
+	if(valid1 && valid2) {
+		/* If both are possible, do the expensive reflection-based check. */
+		float2 N1 = make_float2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2));
+		float2 N2 = make_float2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2));
+
+		float R1 = 2*(N1.x*Ix + N1.y*Iz)*N1.y - Iz;
+		float R2 = 2*(N2.x*Ix + N2.y*Iz)*N2.y - Iz;
+
+		valid1 = (R1 >= 1e-5f);
+		valid2 = (R2 >= 1e-5f);
+		if(valid1 && valid2) {
+			/* If both solutions are valid, return the one with the shallower reflection since it will be closer to the input
+			 * (if the original reflection wasn't shallow, we would not be in this part of the function). */
+			N_new = (R1 < R2)? N1 : N2;
+		}
+		else {
+			/* If only one reflection is valid (= positive), pick that one. */
+			N_new = (R1 > R2)? N1 : N2;
+		}
 
-	float Nz = safe_sqrtf(0.5f * c * (1.0f / Ix2Iz2));
-	float Nx = safe_sqrtf(1.0f - sqr(Nz));
+	}
+	else if(valid1 || valid2) {
+		/* Only one solution passes the N'.z criterion, so pick that one. */
+		float Nz2 = valid1? N1_z2 : N2_z2;
+		N_new = make_float2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2));
+	}
+	else {
+		return Ng;
+	}
 
-	/* Transform back into global coordinates. */
-	return Nx*X + Nz*Ng;
+	return N_new.x*X + N_new.y*Ng;
 }
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_MONTECARLO_CL__ */
+#endif  /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 458aa6c2a97..80477f921ea 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-CCL_NAMESPACE_BEGIN
-
 #if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
 #define __ATOMIC_PASS_WRITE__
 #endif
 
+#include "kernel/kernel_id_passes.h"
+
+CCL_NAMESPACE_BEGIN
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
@@ -108,7 +110,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
 	float value = path_total_shaded / max(path_total, 1e-7f);
 	kernel_write_pass_float(buffer+2, value*value);
 }
-#endif /* __DENOISING_FEATURES__ */
+#endif  /* __DENOISING_FEATURES__ */
 
 ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
                                                         ShaderData *sd,
@@ -187,7 +189,24 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
 		                        L->debug_data.num_ray_bounces);
 	}
 }
-#endif /* __KERNEL_DEBUG__ */
+#endif  /* __KERNEL_DEBUG__ */
+
+#ifdef __KERNEL_CPU__
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
+ccl_device_inline size_t kernel_write_id_pass_cpu(float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+{
+	if(map) {
+		(*map)[id] += matte_weight;  /* Accumulate in the coverage map instead of the buffer. */
+		return 0;  /* Nothing was written to the buffer. */
+	}
+#else  /* __KERNEL_CPU__ */
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) 
+ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, size_t depth, float id, float matte_weight)
+{
+#endif  /* __KERNEL_CPU__ */
+	kernel_write_id_slots(buffer, depth, id, matte_weight);  /* Shared tail of both variants; 'depth' is passed on as the slot count. */
+	return depth * 2;  /* Two floats (id, weight) per slot; callers advance the buffer by this. */
+}
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
 	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
@@ -242,6 +261,26 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		}
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd)));
+		if(matte_weight > 0.0f) {
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+				float id = object_cryptomatte_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+				float id = shader_cryptomatte_id(kg, sd->shader);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+				float id = object_cryptomatte_asset_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+			}
+		}
+	}
+
+
 	if(light_flag & PASSMASK_COMPONENT(DIFFUSE))
 		L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput;
 	if(light_flag & PASSMASK_COMPONENT(GLOSSY))
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5745762e183..cb1f410b09f 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -266,7 +266,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
 }
 #endif  /* __VOLUME__ */
 
-#endif /* __SPLIT_KERNEL__ */
+#endif  /* __SPLIT_KERNEL__ */
 
 ccl_device_forceinline bool kernel_path_shader_apply(
 	KernelGlobals *kg,
@@ -434,7 +434,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		else if(result == VOLUME_PATH_MISSED) {
 			break;
 		}
-#endif /* __VOLUME__*/
+#endif  /* __VOLUME__*/
 
 		/* Shade background. */
 		if(!hit) {
@@ -557,7 +557,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 }
 
-#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
+#endif  /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
 ccl_device_forceinline void kernel_path_integrate(
 	KernelGlobals *kg,
@@ -605,7 +605,7 @@ ccl_device_forceinline void kernel_path_integrate(
 		else if(result == VOLUME_PATH_MISSED) {
 			break;
 		}
-#endif /* __VOLUME__*/
+#endif  /* __VOLUME__*/
 
 		/* Shade background. */
 		if(!hit) {
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index feaea15d3c4..d2506fc1e7e 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
 			}
 		}
 	}
-#endif /* __EMISSION__ */
+#endif  /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -277,10 +277,10 @@ ccl_device void kernel_branched_path_volume_connect_light(
 			}
 		}
 	}
-#endif /* __EMISSION__ */
+#endif  /* __EMISSION__ */
 }
-#endif /* __SPLIT_KERNEL__ */
+#endif  /* __SPLIT_KERNEL__ */
 
-#endif /* __VOLUME_SCATTER__ */
+#endif  /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index e32d4bbbc1b..de8cc4a0cef 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -145,4 +145,4 @@ ccl_device int dequeue_ray_index(
 
 CCL_NAMESPACE_END
 
-#endif // __KERNEL_QUEUE_H__
+#endif  // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index b33e4eba8a4..61ddf4a4f81 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -50,7 +50,7 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 	return result;
 }
 
-#endif /* __SOBOL__ */
+#endif  /* __SOBOL__ */
 
 
 ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index e834b701f96..af883aa715b 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -1276,4 +1276,9 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 }
 #endif  /* __TRANSPARENT_SHADOWS__ */
 
+ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+{
+	return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;  /* shader & SHADER_MASK selects the __shaders entry. */
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 8a0da6c3b13..fafa3ad4bfa 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -446,7 +446,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
 }
 
 #  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
                                       ShaderData *sd,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index e93100a6442..864aa7c470a 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,6 +17,12 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
+#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
+#  include <embree3/rtcore.h>
+#  include <embree3/rtcore_scene.h>
+#  define __EMBREE__
+#endif
+
 #include "kernel/kernel_math.h"
 #include "kernel/svm/svm_types.h"
 #include "util/util_static_assert.h"
@@ -53,6 +59,7 @@ CCL_NAMESPACE_BEGIN
 #define OBJECT_NONE				(~0)
 #define PRIM_NONE				(~0)
 #define LAMP_NONE				(~0)
+#define ID_NONE					(0.0f)
 
 #define VOLUME_STACK_SIZE		32
 
@@ -415,6 +422,7 @@ typedef enum PassType {
 	PASS_RAY_BOUNCES,
 #endif
 	PASS_RENDER_TIME,
+	PASS_CRYPTOMATTE,
 	PASS_CATEGORY_MAIN_END = 31,
 
 	PASS_MIST = 32,
@@ -443,6 +451,17 @@ typedef enum PassType {
 
 #define PASS_ANY (~0)
 
+/* Cryptomatte option bits. Every non-zero enumerator occupies its own bit,
+ * so several of them can be OR-ed together into a single integer mask.
+ * NOTE(review): presumably the value kept in KernelFilm::cryptomatte_passes; confirm. */
+typedef enum CryptomatteType {
+	CRYPT_NONE     = 0,
+	CRYPT_OBJECT   = (1 << 0),
+	CRYPT_MATERIAL = (1 << 1),
+	CRYPT_ASSET    = (1 << 2),
+	CRYPT_ACCURATE = (1 << 3),
+} CryptomatteType;
+
 typedef enum DenoisingPassOffsets {
 	DENOISING_PASS_NORMAL             = 0,
 	DENOISING_PASS_NORMAL_VAR         = 3,
@@ -599,7 +615,7 @@ typedef ccl_addr_space struct PathRadiance {
 
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
-#endif /* __KERNEL_DEBUG__ */
+#endif  /* __KERNEL_DEBUG__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -712,6 +728,9 @@ typedef struct Ray {
 /* Intersection */
 
 typedef struct Intersection {
+#ifdef __EMBREE__
+	float3 Ng;
+#endif
 	float t, u, v;
 	int prim;
 	int object;
@@ -1260,6 +1279,9 @@ typedef struct KernelFilm {
 	int pass_shadow;
 	float pass_shadow_scale;
 	int filter_table_offset;
+	int cryptomatte_passes;
+	int cryptomatte_depth;
+	int pass_cryptomatte;
 
 	int pass_mist;
 	float mist_start;
@@ -1270,8 +1292,6 @@ typedef struct KernelFilm {
 	int pass_denoising_clean;
 	int denoising_flags;
 
-	int pad1, pad2, pad3;
-
 	/* XYZ to rendering color space transform. float4 instead of float3 to
 	 * ensure consistent padding/alignment across devices. */
 	float4 xyz_to_r;
@@ -1385,20 +1405,29 @@ typedef enum KernelBVHLayout {
 	BVH_LAYOUT_BVH2 = (1 << 0),
 	BVH_LAYOUT_BVH4 = (1 << 1),
 	BVH_LAYOUT_BVH8 = (1 << 2),
-
+	BVH_LAYOUT_EMBREE = (1 << 3),
 	BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
 	BVH_LAYOUT_ALL = (unsigned int)(-1),
 } KernelBVHLayout;
 
 typedef struct KernelBVH {
-	/* root node */
+	/* Own BVH */
 	int root;
 	int have_motion;
 	int have_curves;
 	int have_instancing;
 	int bvh_layout;
 	int use_bvh_steps;
+
+	/* Embree scene, only with __EMBREE__. NOTE(review): the pads presumably keep the struct valid for the static_assert_align() below; confirm. */
+#ifdef __EMBREE__
+	RTCScene scene;
+#  ifndef __KERNEL_64_BIT__
+	int pad1;  /* extra pad when __KERNEL_64_BIT__ is not defined */
+#  endif
+#else
 	int pad1, pad2;
+#endif
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1460,7 +1489,11 @@ typedef struct KernelObject {
 	uint patch_map_offset;
 	uint attribute_map_offset;
 	uint motion_offset;
-	uint pad;
+	uint pad1;
+
+	float cryptomatte_object;
+	float cryptomatte_asset;
+	float pad2, pad3;
 } KernelObject;
 static_assert_align(KernelObject, 16);
 
@@ -1540,7 +1573,7 @@ static_assert_align(KernelParticle, 16);
 
 typedef struct KernelShader {
 	float constant_emission[3];
-	float pad1;
+	float cryptomatte_id;
 	int flags;
 	int pass_id;
 	int pad2, pad3;
@@ -1672,4 +1705,4 @@ typedef struct WorkTile {
 
 CCL_NAMESPACE_END
 
-#endif /*  __KERNEL_TYPES_H__ */
+#endif  /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index d71761a97bc..d6d283c42c5 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -87,7 +87,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
 	return true;
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 ccl_device float3 volume_color_transmittance(float3 sigma, float t)
 {
@@ -270,7 +270,7 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
 		kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 /* Equi-angular sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media" */
@@ -1075,7 +1075,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
-#endif /* __SPLIT_KERNEL */
+#endif  /* __SPLIT_KERNEL__ */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -1377,6 +1377,6 @@ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
 	}
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index b62aa9663ec..e036b53b810 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 26777fdabb2..4c758711481 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index b77b7350d86..ae4fd85780d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -26,7 +26,7 @@ template<typename T> struct TextureInterpolator  {
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 	static ccl_always_inline float4 read(float4 r)
 	{
@@ -540,4 +540,4 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 
 CCL_NAMESPACE_END
 
-#endif // __KERNEL_CPU_IMAGE_H__
+#endif  // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 5ec1655ab05..759b7e4c20d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -97,7 +97,7 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
 	{
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Film */
@@ -120,7 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -141,7 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -176,7 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	else {
 		kernel_background_evaluate(kg, input, output, i);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 #else  /* __SPLIT_KERNEL__ */
@@ -208,7 +208,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		ccl_local type locals; \
 		kernel_##name(kg, &locals); \
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 0561c40e6b1..b856cbde45c 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int w,
                                        int h,
                                        int stride,
-                                       int shift_stride,
+                                       int pass_stride,
                                        int r,
                                        int channel_offset,
                                        float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                             int w,
                             int h,
                             int stride,
-                            int shift_stride,
+                            int pass_stride,
                             int r,
                             int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                    int w,
                                    int h,
                                    int stride,
-                                   int shift_stride,
+                                   int pass_stride,
                                    int r,
                                    int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int w,
                                      int h,
                                      int stride,
-                                     int shift_stride,
+                                     int pass_stride,
                                      int r,
                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
-                                         int f,
-                                         int pass_stride)
+                                         int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 8a180a509e8..af311027f78 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -40,14 +40,24 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	/* Declared outside the branch because the cryptomatte step below reuses them. */
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread in the block is still writing to the buffers.
+		 * Inactive threads reach the barrier too, hence the separate branch. */
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -56,14 +63,24 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
 kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	/* Declared outside the branch because the cryptomatte step below reuses them. */
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread in the block is still writing to the buffers.
+		 * Inactive threads reach the barrier too, hence the separate branch. */
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 3c75754fb39..a550f97f4eb 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int w,
                                                     int h,
                                                     int stride,
-                                                    int shift_stride,
+                                                    int pass_stride,
                                                     int r,
                                                     int channel_offset,
                                                     float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
                                          int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
                                                 int w,
                                                 int h,
                                                 int stride,
-                                                int shift_stride,
+                                                int pass_stride,
                                                 int r,
                                                 int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int w,
                                                   int h,
                                                   int stride,
-                                                  int shift_stride,
+                                                  int pass_stride,
                                                   int r,
                                                   int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int w,
                                                       int h,
                                                       int stride,
-                                                      int shift_stride,
+                                                      int pass_stride,
                                                       int r,
-                                                      int f,
-                                                      int pass_stride)
+                                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 63128d0aecf..de1f5088629 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -66,9 +66,17 @@ __kernel void kernel_ocl_path_trace(
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
+	bool thread_is_active = x < sx + sw && y < sy + sh;
+	if(thread_is_active) {
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
+	}
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(thread_is_active) {
+			kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride);
+		}
+	}
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index dd9d683e030..79af831c2fb 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -142,7 +142,7 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index d9aeb9ab9fb..2a50704b569 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -146,4 +146,4 @@ CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure)
 
 CCL_NAMESPACE_END
 
-#endif /* __OSL_CLOSURES_H__ */
+#endif  /* __OSL_CLOSURES_H__ */
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 30b29793e2d..88192fbcccb 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -94,4 +94,4 @@ CCL_NAMESPACE_END
 
 #endif
 
-#endif /* __OSL_GLOBALS_H__ */
+#endif  /* __OSL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 7902381440b..97f97a4887e 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -884,6 +884,29 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo
 	return false; /* never called by OSL */
 }
 
+/* Look up the texture system handle for a texture file name.
+ *
+ * Names starting with '@' denote builtin textures; for those NULL is
+ * returned and the texture system is not queried. */
+TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring filename)
+{
+	const bool is_builtin = filename.length() && filename[0] == '@';
+
+	if(!is_builtin) {
+		return texturesys()->get_texture_handle(filename);
+	}
+
+	/* Dummy, we don't use texture handles for builtin textures but need
+	 * to tell the OSL runtime optimizer that this is a valid texture. */
+	return NULL;
+}
+
+/* Forward the validity check of a texture handle to the texture system. */
+bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
+{
+	return texturesys()->good(texture_handle);
+}
+
 bool OSLRenderServices::texture(ustring filename,
                                 TextureHandle *texture_handle,
                                 TexturePerthread *texture_thread_info,
@@ -894,7 +911,8 @@ bool OSLRenderServices::texture(ustring filename,
                                 int nchannels,
                                 float *result,
                                 float *dresultds,
-                                float *dresultdt)
+                                float *dresultdt,
+                                ustring *errormessage)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -1035,7 +1053,7 @@ bool OSLRenderServices::texture(ustring filename,
 		 * other nasty stuff happening.
 		 */
 		string err = ts->geterror();
-		(void)err;
+		(void) err;
 	}
 
 	return status;
@@ -1114,7 +1132,7 @@ bool OSLRenderServices::texture3d(ustring filename,
 		 * other nasty stuff happening.
 		 */
 		string err = ts->geterror();
-		(void)err;
+		(void) err;
 	}
 
 	return status;
@@ -1156,7 +1174,13 @@ bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filenam
                                          TypeDesc datatype, void *data)
 {
 	OSL::TextureSystem *ts = osl_ts;
-	return ts->get_texture_info(filename, subimage, dataname, datatype, data);
+	if(filename.length() && filename[0] == '@') {
+		/* Special builtin textures. */
+		return false;
+	}
+	else {
+		return ts->get_texture_info(filename, subimage, dataname, datatype, data);
+	}
 }
 
 int OSLRenderServices::pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center,
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 50044746fd1..712b06b41b8 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -93,6 +93,10 @@ public:
 	bool getmessage(OSL::ShaderGlobals *sg, ustring source, ustring name,
 	                TypeDesc type, void *val, bool derivatives);
 
+	TextureSystem::TextureHandle *get_texture_handle(ustring filename);
+
+	bool good(TextureSystem::TextureHandle *texture_handle);
+
 	bool texture(ustring filename,
 	             TextureSystem::TextureHandle *texture_handle,
 	             TexturePerthread *texture_thread_info,
@@ -103,7 +107,8 @@ public:
 	             int nchannels,
 	             float *result,
 	             float *dresultds,
-	             float *dresultdt);
+	             float *dresultdt,
+	             ustring *errormessage);
 
 	bool texture3d(ustring filename,
 	               TextureHandle *texture_handle,
@@ -194,4 +199,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __OSL_SERVICES_H__  */
+#endif  /* __OSL_SERVICES_H__  */
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 6a690e880ad..a89bb3fd1a3 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -193,7 +193,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 			float data[9];
 			bool found = kg->osl->services->get_attribute(sd, true, OSLRenderServices::u_empty, TypeDesc::TypeVector,
 			                                              OSLRenderServices::u_geom_undisplaced, data);
-			(void)found;
+			(void) found;
 			assert(found);
 
 			memcpy(&sd->P, data, sizeof(float)*3);
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 571a3f502be..9824f966a44 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -66,4 +66,4 @@ CCL_NAMESPACE_END
 
 #endif
 
-#endif /* __OSL_SHADER_H__ */
+#endif  /* __OSL_SHADER_H__ */
diff --git a/intern/cycles/kernel/shaders/oslutil.h b/intern/cycles/kernel/shaders/oslutil.h
index 141e5d27e3a..592a8ad12d9 100644
--- a/intern/cycles/kernel/shaders/oslutil.h
+++ b/intern/cycles/kernel/shaders/oslutil.h
@@ -92,4 +92,4 @@ float wireframe(string edge_type, float line_width) { return wireframe(edge_type
 float wireframe(string edge_type) { return wireframe(edge_type, 1.0, 1); }
 float wireframe() { return wireframe("polygons", 1.0, 1); }
 
-#endif /* CCL_OSLUTIL_H */
+#endif  /* CCL_OSLUTIL_H */
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 4a8378796ba..7136c746321 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -284,33 +284,63 @@ point rotate (point p, float angle, point a, point b)
 
 normal ensure_valid_reflection(normal Ng, vector I, normal N)
 {
+    /* The implementation here mirrors the one in kernel_montecarlo.h,
+     * check there for an explanation of the algorithm. */
+
     float sqr(float x) { return x*x; }
 
     vector R = 2*dot(N, I)*N - I;
-    if (dot(Ng, R) >= 0.05) {
+    /* N is returned unchanged when its reflection R reaches at least `threshold` along Ng. */
+    float threshold = min(0.9*dot(Ng, I), 0.01);
+    if(dot(Ng, R) >= threshold) {
         return N;
     }
 
-    /* Form coordinate system with Ng as the Z axis and N inside the X-Z-plane.
-     * The X axis is found by normalizing the component of N that's orthogonal to Ng.
-     * The Y axis isn't actually needed.
-     */
-    vector X = normalize(N - dot(N, Ng)*Ng);
+    float NdotNg = dot(N, Ng);
+    vector X = normalize(N - NdotNg*Ng);  /* local x axis; Ng serves as the local z axis */
 
-    /* Calculate N.z and N.x in the local coordinate system. */
     float Ix = dot(I, X), Iz = dot(I, Ng);
-    float Ix2 = sqr(dot(I, X)), Iz2 = sqr(dot(I, Ng));
-    float Ix2Iz2 = Ix2 + Iz2;
-
-    float a = sqrt(Ix2*(Ix2Iz2 - sqr(0.05)));
-    float b = Iz*0.05 + Ix2Iz2;
-    float c = (a + b > 0.0)? (a + b) : (-a + b);
+    float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+    float a = Ix2 + Iz2;
+
+    float b = sqrt(Ix2*(a - sqr(threshold)));
+    float c = Iz*threshold + a;
+    /* Two candidates for the squared Ng-component of the new normal, each usable only inside (1e-5, 1 + 1e-5]. */
+    float fac = 0.5/a;
+    float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c);
+    int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));
+    int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
+
+    float N_new_x, N_new_z;
+    if(valid1 && valid2) {
+        float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
+        float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
+        /* R1, R2: Ng-component of I reflected about candidate 1 and candidate 2. */
+        float R1 = 2*(N1_x*Ix + N1_z*Iz)*N1_z - Iz;
+        float R2 = 2*(N2_x*Ix + N2_z*Iz)*N2_z - Iz;
+        /* If both reflections reach 1e-5 the candidate with the smaller one wins, otherwise the one with the larger. */
+        valid1 = (R1 >= 1e-5);
+        valid2 = (R2 >= 1e-5);
+        if(valid1 && valid2) {
+            N_new_x = (R1 < R2)? N1_x : N2_x;
+            N_new_z = (R1 < R2)? N1_z : N2_z;
+        }
+        else {
+            N_new_x = (R1 > R2)? N1_x : N2_x;
+            N_new_z = (R1 > R2)? N1_z : N2_z;
+        }
 
-    float Nz = sqrt(0.5 * c * (1.0 / Ix2Iz2));
-    float Nx = sqrt(1.0 - sqr(Nz));
+    }
+    else if(valid1 || valid2) {
+        float Nz2 = valid1? N1_z2 : N2_z2;
+        N_new_x = sqrt(1.0 - Nz2);
+        N_new_z = sqrt(Nz2);
+    }
+    else {
+        return Ng;  /* neither candidate is usable: fall back to Ng */
+    }
 
-    /* Transform back into global coordinates. */
-    return Nx*X + Nz*Ng;
+    return N_new_x*X + N_new_z*Ng;
 }
 
 
@@ -485,7 +515,7 @@ float smooth_linearstep (float edge0, float edge1, float x_, float eps_) {
         else if (x >= eps && x <= 1.0-eps) result = x;
         else if (x >= 1.0+eps)             result = 1;
         else if (x < eps)                  result = rampup (x+eps, 2.0*eps);
-        else /* if (x < 1.0+eps) */        result = 1.0 - rampup (1.0+eps - x, 2.0*eps);
+        else  /* if (x < 1.0+eps) */        result = 1.0 - rampup (1.0+eps - x, 2.0*eps);
     } else {
         result = step (edge0, x_);
     }
@@ -656,4 +686,4 @@ int getmatrix (string fromspace, output matrix M) {
 #undef PERCOMP2
 #undef PERCOMP2F
 
-#endif /* CCL_STDOSL_H */
+#endif  /* CCL_STDOSL_H */
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 180c0b57077..18eec6372f1 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -80,8 +80,10 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	bool ray_was_updated = false;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		ray_was_updated = true;
 		uint sample = state->sample;
 		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
 		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
@@ -92,6 +94,17 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
+			uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+			ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+		}
+	}
+
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
 		ccl_global uint *work_pools = kernel_split_params.work_pools;
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
index 2132c42220f..666355de334 100644
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -78,7 +78,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 			}
 		}
 	}
-#  endif /* __KERNEL_OPENCL__ */
+#  endif  /* __KERNEL_OPENCL__ */
 
 	/* copy to destination */
 	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
@@ -91,7 +91,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 			kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
 		}
 	}
-#endif /* __KERNEL_CUDA__ */
+#endif  /* __KERNEL_CUDA__ */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 9297e1e0ad5..3f6b3977d79 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
 {
-	(void)kg;  /* Unused on CPU. */
+	(void) kg;  /* Unused on CPU. */
 
 	uint64_t size = 0;
 #define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
@@ -48,7 +48,7 @@ ccl_device_inline void split_data_init(KernelGlobals *kg,
                                        ccl_global void *data,
                                        ccl_global char *ray_state)
 {
-	(void)kg;  /* Unused on CPU. */
+	(void) kg;  /* Unused on CPU. */
 
 	ccl_global char *p = (ccl_global char*)data;
 
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 56194d9f857..83df1e2a0a6 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -86,14 +86,14 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
 #else
 #  define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
+#endif  /* __SUBSURFACE__ */
 
 #ifdef __VOLUME__
 #  define SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
 #else
 #  define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 #define SPLIT_DATA_ENTRIES \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index ab69afa051e..ccb9aef7a5b 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -313,7 +313,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 			case NODE_LEAVE_BUMP_EVAL:
 				svm_node_leave_bump_eval(kg, sd, stack, node.y);
 				break;
-#      endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
+#      endif  /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
 #    endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
 			case NODE_HSV:
 				svm_node_hsv(kg, sd, stack, node, &offset);
@@ -497,4 +497,4 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_H__ */
+#endif  /* __SVM_H__ */
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 64bf8244999..3cf33f4d431 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -262,7 +262,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					        ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra))
 					        : NULL;
 
-					if (bsdf && extra) {
+					if(bsdf && extra) {
 						bsdf->N = N;
 						bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f;
 						bsdf->T = T;
@@ -285,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						/* setup bsdf */
 						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
 							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
-						else /* use multi-scatter GGX */
+						else  /* use multi-scatter GGX */
 							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
 					}
 				}
@@ -314,7 +314,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							        ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra))
 							        : NULL;
 
-							if (bsdf && extra) {
+							if(bsdf && extra) {
 								bsdf->N = N;
 								bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 								bsdf->extra = extra;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 27127b85323..41538d1138d 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -59,4 +59,4 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_HSV_H__ */
+#endif  /* __SVM_HSV_H__ */
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index a3e4b6e87cd..6f39391057e 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -108,4 +108,4 @@ ccl_device void svm_node_curves(KernelGlobals *kg, ShaderData *sd, float *stack,
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_RAMP_H__ */
+#endif  /* __SVM_RAMP_H__ */
diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h b/intern/cycles/kernel/svm/svm_ramp_util.h
index a67689ff9d1..847108ff1c2 100644
--- a/intern/cycles/kernel/svm/svm_ramp_util.h
+++ b/intern/cycles/kernel/svm/svm_ramp_util.h
@@ -95,4 +95,4 @@ ccl_device float float_ramp_lookup(const float *ramp,
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_RAMP_UTIL_H__ */
+#endif  /* __SVM_RAMP_UTIL_H__ */
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 910537a2539..0f1dfa4936b 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -531,4 +531,4 @@ typedef enum ClosureType {
 
 CCL_NAMESPACE_END
 
-#endif /*  __SVM_TYPES_H__ */
+#endif  /*  __SVM_TYPES_H__ */
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 7b60ab6e6ae..80b63dc80cd 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -24,7 +24,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, NodeWaveProfile profile, f
 
 	if(type == NODE_WAVE_BANDS)
 		n = (p.x + p.y + p.z) * 10.0f;
-	else /* NODE_WAVE_RINGS */
+	else  /* NODE_WAVE_RINGS */
 		n = len(p) * 20.0f;
 
 	if(distortion != 0.0f)
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 7d2220f37f9..c0ce7368771 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
 	buffers.cpp
 	camera.cpp
 	constant_fold.cpp
+	coverage.cpp
 	film.cpp
 	graph.cpp
 	image.cpp
@@ -46,6 +47,7 @@ set(SRC_HEADERS
 	buffers.h
 	camera.h
 	constant_fold.h
+	coverage.h
 	film.h
 	graph.h
 	image.h
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index 40e5be2e1b2..e7438f4513d 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -172,4 +172,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __ATTRIBUTE_H__ */
+#endif  /* __ATTRIBUTE_H__ */
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 3f56dedb2c8..17c3eaaaaf5 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -59,4 +59,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __BACKGROUND_H__ */
+#endif  /* __BACKGROUND_H__ */
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index a811eac3327..fce8f2fa606 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -83,4 +83,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __BAKE_H__ */
+#endif  /* __BAKE_H__ */
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 4cd8b3726d3..f901885e679 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -147,7 +147,7 @@ bool RenderBuffers::copy_from_device()
 	return true;
 }
 
-bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
+bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample, int components, float *pixels)
 {
 	if(buffer.data() == NULL) {
 		return false;
@@ -155,19 +155,20 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp
 
 	float invsample = 1.0f/sample;
 	float scale = invsample;
-	bool variance = (offset == DENOISING_PASS_NORMAL_VAR) ||
-	                (offset == DENOISING_PASS_ALBEDO_VAR) ||
-	                (offset == DENOISING_PASS_DEPTH_VAR) ||
-	                (offset == DENOISING_PASS_COLOR_VAR);
-
-	if(offset == DENOISING_PASS_COLOR || offset == DENOISING_PASS_CLEAN) {
-		scale *= exposure;
+	bool variance = (type == DENOISING_PASS_NORMAL_VAR) ||
+	                (type == DENOISING_PASS_ALBEDO_VAR) ||
+	                (type == DENOISING_PASS_DEPTH_VAR) ||
+	                (type == DENOISING_PASS_COLOR_VAR);
+
+	float scale_exposure = scale;
+	if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) {
+		scale_exposure *= exposure;
 	}
-	else if(offset == DENOISING_PASS_COLOR_VAR) {
-		scale *= exposure*exposure;
+	else if(type == DENOISING_PASS_COLOR_VAR) {
+		scale_exposure *= exposure*exposure;
 	}
 
-	offset += params.get_denoising_offset();
+	int offset = type + params.get_denoising_offset();
 	int pass_stride = params.get_passes_size();
 	int size = params.width*params.height;
 
@@ -181,14 +182,14 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp
 
 		if(components == 1) {
 			for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) {
-				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale;
+				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
 			}
 		}
 		else if(components == 3) {
 			for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) {
-				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale;
-				pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale;
-				pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale;
+				pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
+				pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure;
+				pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure;
 			}
 		}
 		else {
@@ -200,14 +201,28 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp
 
 		if(components == 1) {
 			for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-				pixels[0] = in[0]*scale;
+				pixels[0] = in[0]*scale_exposure;
 			}
 		}
 		else if(components == 3) {
 			for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
-				pixels[0] = in[0]*scale;
-				pixels[1] = in[1]*scale;
-				pixels[2] = in[2]*scale;
+				pixels[0] = in[0]*scale_exposure;
+				pixels[1] = in[1]*scale_exposure;
+				pixels[2] = in[2]*scale_exposure;
+			}
+		}
+		else if(components == 4) {
+			assert(type == DENOISING_PASS_COLOR);
+
+			/* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
+			assert(params.passes[0].type == PASS_COMBINED);
+			float *in_combined = buffer.data();
+
+			for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
+				pixels[0] = in[0]*scale_exposure;
+				pixels[1] = in[1]*scale_exposure;
+				pixels[2] = in[2]*scale_exposure;
+				pixels[3] = saturate(in_combined[3]*scale);
 			}
 		}
 		else {
@@ -218,7 +233,7 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp
 	return true;
 }
 
-bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels)
+bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels, const string &name)
 {
 	if(buffer.data() == NULL) {
 		return false;
@@ -234,6 +249,14 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 			continue;
 		}
 
+		/* Tell Cryptomatte passes apart by their name. */
+		if(pass.type == PASS_CRYPTOMATTE) {
+			if(pass.name != name) {
+				pass_offset += pass.components;
+				continue;
+			}
+		}
+
 		float *in = buffer.data() + pass_offset;
 		int pass_stride = params.get_passes_size();
 
@@ -370,6 +393,17 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 					pixels[3] = f.w*invw;
 				}
 			}
+			else if(type == PASS_CRYPTOMATTE) {
+				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+					float4 f = make_float4(in[0], in[1], in[2], in[3]);
+					/* x and z contain integer IDs, don't rescale them.
+					   y and w contain matte weights, they get scaled. */
+					pixels[0] = f.x;
+					pixels[1] = f.y * scale;
+					pixels[2] = f.z;
+					pixels[3] = f.w * scale;
+				}
+			}
 			else {
 				for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
 					float4 f = make_float4(in[0], in[1], in[2], in[3]);
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 1b06ffe33a6..46c3b89bd84 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -50,7 +50,7 @@ public:
 	int full_height;
 
 	/* passes */
-	array<Pass> passes;
+	vector<Pass> passes;
 	bool denoising_data_pass;
 	/* If only some light path types should be denoised, an additional pass is needed. */
 	bool denoising_clean_pass;
@@ -84,7 +84,7 @@ public:
 	void zero();
 
 	bool copy_from_device();
-	bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
+	bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels, const string &name);
 	bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
 };
 
@@ -146,4 +146,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __BUFFERS_H__ */
+#endif  /* __BUFFERS_H__ */
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index ec3c56e820a..34066e1b024 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -716,7 +716,7 @@ float Camera::world_to_raster_size(float3 P)
 		float3 D = transform_point(&worldtocamera, P);
 		float dist = len(D);
 
-		Ray ray = {0};
+		Ray ray = {{0}};
 
 		/* Distortion can become so great that the results become meaningless, there
 		 * may be a better way to do this, but calculating differentials from the
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 323f2c61ca4..37f5dea624f 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -21,6 +21,7 @@
 
 #include "graph/node.h"
 
+#include "util/util_array.h"
 #include "util/util_boundbox.h"
 #include "util/util_projection.h"
 #include "util/util_transform.h"
@@ -212,4 +213,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __CAMERA_H__ */
+#endif  /* __CAMERA_H__ */
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 26fa4e8b1c8..6ec94b055e3 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -70,4 +70,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __CONSTANT_FOLD_H__ */
+#endif  /* __CONSTANT_FOLD_H__ */
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
new file mode 100644
index 00000000000..72ef4cda3ff
--- /dev/null
+++ b/intern/cycles/render/coverage.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/coverage.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_id_passes.h"
+#include "kernel/kernel_types.h"
+#include "util/util_map.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool crypomatte_comp(const pair<float, float>& i, const pair<float, float>& j) { return i.first > j.first; }  /* Descending by pair.first. */
+
+void Coverage::finalize()
+{
+	/* Enabled types are laid out back to back in object, material, asset order,
+	 * each taking cryptomatte_depth * 4 floats after pass_cryptomatte. */
+	const int type_flags[3] = {CRYPT_OBJECT, CRYPT_MATERIAL, CRYPT_ASSET};
+	vector<CoverageMap> *type_maps[3] = {&coverage_object, &coverage_material, &coverage_asset};
+	int pass_offset = 0;
+
+	for(int i = 0; i < 3; i++) {
+		if(kernel_data.film.cryptomatte_passes & type_flags[i]) {
+			finalize_buffer(*type_maps[i], pass_offset);
+			pass_offset += kernel_data.film.cryptomatte_depth * 4;
+		}
+	}
+}
+
+void Coverage::init_path_trace()
+{
+	/* No per-pixel map is selected until init_pixel() runs. */
+	kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL;
+	if(!(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE)) {
+		return;
+	}
+	/* Accurate mode: start every enabled type with one empty map per tile pixel. */
+	const int num_pixels = tile.w * tile.h;
+	if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+		coverage_object.assign(num_pixels, CoverageMap());
+	}
+	if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+		coverage_material.assign(num_pixels, CoverageMap());
+	}
+	if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+		coverage_asset.assign(num_pixels, CoverageMap());
+	}
+}
+
+void Coverage::init_pixel(int x, int y)
+{
+	if(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {  /* Maps are only sized in this mode, see init_path_trace(). */
+		const int pixel_index = tile.w * (y - tile.y) + x - tile.x;  /* Row-major index relative to the tile origin. */
+		if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+			kg->coverage_object = &coverage_object[pixel_index];  /* Point the kernel globals at this pixel's map. */
+		}
+		if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+			kg->coverage_material = &coverage_material[pixel_index];
+		}
+		if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+			kg->coverage_asset = &coverage_asset[pixel_index];
+		}
+	}
+}
+
+void Coverage::finalize_buffer(vector<CoverageMap>& coverage, const int pass_offset)
+{
+	/* Without accurate mode there are no host-side maps; only sort the slots in the buffer. */
+	if(!(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE)) {
+		sort_buffer(pass_offset);
+		return;
+	}
+	flatten_buffer(coverage, pass_offset);
+}
+
+void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset)
+{
+	/* Rank each pixel's coverage map by weight and write the top entries to its Cryptomatte slots. */
+	int pixel_index = 0;
+	int pass_stride = tile.buffers->params.get_passes_size();
+	for(int y = 0; y < tile.h; ++y) {
+		for(int x = 0; x < tile.w; ++x) {
+			const CoverageMap& pixel = coverage[pixel_index];
+			if(!pixel.empty()) {
+				/* buffer offset */
+				int index = x + y * tile.stride;
+				float *buffer = (float*)tile.buffer + index*pass_stride;
+
+				/* Collect (weight, id) pairs and order them by descending weight. */
+				vector<pair<float, float> > sorted_pixel;
+				for(CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) {
+					sorted_pixel.push_back(std::make_pair(it->second, it->first));
+				}
+				sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp);
+				const int num_slots = 2 * (kernel_data.film.cryptomatte_depth);
+				if(num_slots > 0 && sorted_pixel.size() > (size_t)num_slots) {  /* Guard keeps num_slots-1 a valid index. */
+					float leftover = 0.0f;  /* Summed weight of the entries that do not fit, added to the last slot. */
+					for(vector<pair<float, float> >::iterator it = sorted_pixel.begin()+num_slots; it != sorted_pixel.end(); ++it) {
+						leftover += it->first;
+					}
+					sorted_pixel[num_slots-1].first += leftover;
+				}
+				const int limit = min(num_slots, (int)sorted_pixel.size());  /* Explicit cast: no mixed int/size_t min(). */
+				for(int i = 0; i < limit; ++i) {
+					kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, num_slots, sorted_pixel[i].second, sorted_pixel[i].first);
+				}
+			}
+			++pixel_index;
+		}
+	}
+}
+
+void Coverage::sort_buffer(const int pass_offset)
+{
+	/* Run kernel_sort_id_slots() on this pass's Cryptomatte slots of every pixel in the tile. */
+	const int pass_stride = tile.buffers->params.get_passes_size();
+	const int num_slots = 2 * (kernel_data.film.cryptomatte_depth);
+	const int slot_offset = kernel_data.film.pass_cryptomatte + pass_offset;
+	for(int y = 0; y < tile.h; ++y) {
+		for(int x = 0; x < tile.w; ++x) {
+			float *pixel = (float*)tile.buffer + (x + y*tile.stride)*pass_stride;
+			kernel_sort_id_slots(pixel + slot_offset, num_slots);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
new file mode 100644
index 00000000000..9ee0bce7517
--- /dev/null
+++ b/intern/cycles/render/coverage.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __COVERAGE_H__
+#define __COVERAGE_H__
+
+#include "render/buffers.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
+#include "util/util_map.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Coverage {
+public:
+	Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_) {}
+	void init_path_trace();
+	void init_pixel(int x, int y);
+	void finalize();
+private:
+	void finalize_buffer(vector<CoverageMap>& coverage, const int pass_offset);
+	void flatten_buffer(vector<CoverageMap>& coverage, const int pass_offset);
+	void sort_buffer(const int pass_offset);
+	vector<CoverageMap> coverage_object;    /* Per-pixel maps, sized in init_path_trace(). */
+	vector<CoverageMap> coverage_material;
+	vector<CoverageMap> coverage_asset;
+	KernelGlobals *kg;
+	RenderTile &tile;
+};
+
+
+CCL_NAMESPACE_END
+
+#endif  /* __COVERAGE_H__ */
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index 62066d8a809..cf75751c58f 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -17,8 +17,8 @@
 #ifndef __CURVES_H__
 #define __CURVES_H__
 
+#include "util/util_array.h"
 #include "util/util_types.h"
-#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -119,4 +119,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __CURVES_H__ */
+#endif  /* __CURVES_H__ */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 8f3596ade58..d0f15496e50 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -38,11 +38,14 @@ static bool compare_pass_order(const Pass& a, const Pass& b)
 	return (a.components > b.components);
 }
 
-void Pass::add(PassType type, array<Pass>& passes)
+void Pass::add(PassType type, vector<Pass>& passes, const char *name)
 {
-	for(size_t i = 0; i < passes.size(); i++)
-		if(passes[i].type == type)
+	for(size_t i = 0; i < passes.size(); i++) {
+		if(passes[i].type == type &&
+		   (name ? (passes[i].name == name) : passes[i].name.empty())) {
 			return;
+		}
+	}
 
 	Pass pass;
 
@@ -50,6 +53,9 @@ void Pass::add(PassType type, array<Pass>& passes)
 	pass.filter = true;
 	pass.exposure = false;
 	pass.divide_type = PASS_NONE;
+	if(name) {
+		pass.name = name;
+	}
 
 	switch(type) {
 		case PASS_NONE:
@@ -155,13 +161,15 @@ void Pass::add(PassType type, array<Pass>& passes)
 			pass.components = 4;
 			pass.exposure = true;
 			break;
-
+		case PASS_CRYPTOMATTE:
+			pass.components = 4;
+			break;
 		default:
 			assert(false);
 			break;
 	}
 
-	passes.push_back_slow(pass);
+	passes.push_back(pass);
 
 	/* order from by components, to ensure alignment so passes with size 4
 	 * come first and then passes with size 1 */
@@ -171,19 +179,19 @@ void Pass::add(PassType type, array<Pass>& passes)
 		Pass::add(pass.divide_type, passes);
 }
 
-bool Pass::equals(const array<Pass>& A, const array<Pass>& B)
+bool Pass::equals(const vector<Pass>& A, const vector<Pass>& B)
 {
 	if(A.size() != B.size())
 		return false;
 
 	for(int i = 0; i < A.size(); i++)
-		if(A[i].type != B[i].type)
+		if(A[i].type != B[i].type || A[i].name != B[i].name)
 			return false;
 
 	return true;
 }
 
-bool Pass::contains(const array<Pass>& passes, PassType type)
+bool Pass::contains(const vector<Pass>& passes, PassType type)
 {
 	for(size_t i = 0; i < passes.size(); i++)
 		if(passes[i].type == type)
@@ -290,6 +298,7 @@ Film::Film()
 
 	use_light_visibility = false;
 	filter_table_offset = TABLE_OFFSET_INVALID;
+	cryptomatte_passes = CRYPT_NONE;
 
 	need_update = true;
 }
@@ -314,6 +323,8 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kfilm->pass_stride = 0;
 	kfilm->use_light_pass = use_light_visibility || use_sample_clamp;
 
+	bool have_cryptomatte = false;
+
 	for(size_t i = 0; i < passes.size(); i++) {
 		Pass& pass = passes[i];
 
@@ -434,7 +445,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 #endif
 			case PASS_RENDER_TIME:
 				break;
-
+			case PASS_CRYPTOMATTE:
+				kfilm->pass_cryptomatte = have_cryptomatte ? min(kfilm->pass_cryptomatte, kfilm->pass_stride) : kfilm->pass_stride;
+				have_cryptomatte = true;
+				break;
 			default:
 				assert(false);
 				break;
@@ -471,6 +485,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
 	kfilm->mist_falloff = mist_falloff;
 
+	kfilm->cryptomatte_passes = cryptomatte_passes;
+	kfilm->cryptomatte_depth = cryptomatte_depth;
+
 	pass_stride = kfilm->pass_stride;
 	denoising_data_offset = kfilm->pass_denoising_data;
 	denoising_clean_offset = kfilm->pass_denoising_clean;
@@ -490,7 +507,7 @@ bool Film::modified(const Film& film)
 	return !Node::equals(film) || !Pass::equals(passes, film.passes);
 }
 
-void Film::tag_passes_update(Scene *scene, const array<Pass>& passes_)
+void Film::tag_passes_update(Scene *scene, const vector<Pass>& passes_)
 {
 	if(Pass::contains(passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
 		scene->mesh_manager->tag_update(scene);
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 6ab2eea79b8..c597db4e4c5 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -45,10 +45,11 @@ public:
 	bool filter;
 	bool exposure;
 	PassType divide_type;
+	string name;
 
-	static void add(PassType type, array<Pass>& passes);
-	static bool equals(const array<Pass>& A, const array<Pass>& B);
-	static bool contains(const array<Pass>& passes, PassType);
+	static void add(PassType type, vector<Pass>& passes, const char* name = NULL);
+	static bool equals(const vector<Pass>& A, const vector<Pass>& B);
+	static bool contains(const vector<Pass>& passes, PassType);
 };
 
 class Film : public Node {
@@ -56,7 +57,7 @@ public:
 	NODE_DECLARE
 
 	float exposure;
-	array<Pass> passes;
+	vector<Pass> passes;
 	bool denoising_data_pass;
 	bool denoising_clean_pass;
 	int denoising_flags;
@@ -76,6 +77,8 @@ public:
 
 	bool use_light_visibility;
 	bool use_sample_clamp;
+	CryptomatteType cryptomatte_passes;
+	int cryptomatte_depth;
 
 	bool need_update;
 
@@ -86,10 +89,10 @@ public:
 	void device_free(Device *device, DeviceScene *dscene, Scene *scene);
 
 	bool modified(const Film& film);
-	void tag_passes_update(Scene *scene, const array<Pass>& passes_);
+	void tag_passes_update(Scene *scene, const vector<Pass>& passes_);
 	void tag_update(Scene *scene);
 };
 
 CCL_NAMESPACE_END
 
-#endif /* __FILM_H__ */
+#endif  /* __FILM_H__ */
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 426522066b3..d14a59b4900 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -293,4 +293,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __GRAPH_H__ */
+#endif  /* __GRAPH_H__ */
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index d94ebe564e3..8367a6811bd 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -164,4 +164,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __IMAGE_H__ */
+#endif  /* __IMAGE_H__ */
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index f68400ac416..6a7e2056851 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -94,4 +94,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __INTEGRATOR_H__ */
+#endif  /* __INTEGRATOR_H__ */
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 32a911dc256..f4dfe0cadbf 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -139,4 +139,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __LIGHT_H__ */
+#endif  /* __LIGHT_H__ */
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 8a00b88af12..5f884a3f871 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -39,6 +39,10 @@
 #include "util/util_progress.h"
 #include "util/util_set.h"
 
+#ifdef WITH_EMBREE
+#  include "bvh/bvh_embree.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Triangle */
@@ -1068,11 +1072,14 @@ void Mesh::compute_bvh(Device *device,
 			bparams.use_spatial_split = params->use_bvh_spatial_split;
 			bparams.bvh_layout = BVHParams::best_bvh_layout(
 			        params->bvh_layout,
-			        device->info.bvh_layout_mask);
+			        device->get_bvh_layout_mask());
 			bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 			                              params->use_bvh_unaligned_nodes;
 			bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
 			bparams.num_motion_curve_steps = params->num_bvh_time_steps;
+			bparams.bvh_type = params->bvh_type;
+			bparams.curve_flags = dscene->data.curve.curveflags;
+			bparams.curve_subdivisions = dscene->data.curve.subdivisions;
 
 			delete bvh;
 			bvh = BVH::create(bparams, objects);
@@ -1284,9 +1291,9 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 		}
 	}
 #else
-	(void)device;
-	(void)scene;
-	(void)mesh_attributes;
+	(void) device;
+	(void) scene;
+	(void) mesh_attributes;
 #endif
 }
 
@@ -1855,20 +1862,38 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	bparams.top_level = true;
 	bparams.bvh_layout = BVHParams::best_bvh_layout(
 	        scene->params.bvh_layout,
-	        device->info.bvh_layout_mask);
+	        device->get_bvh_layout_mask());
 	bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
 	bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 	                              scene->params.use_bvh_unaligned_nodes;
 	bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
 	bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
+	bparams.bvh_type = scene->params.bvh_type;
+	bparams.curve_flags = dscene->data.curve.curveflags;
+	bparams.curve_subdivisions = dscene->data.curve.subdivisions;
 
 	VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout)
 	        << " layout.";
 
+#ifdef WITH_EMBREE
+	if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
+		if(dscene->data.bvh.scene) {
+			BVHEmbree::destroy(dscene->data.bvh.scene);
+		}
+	}
+#endif
+
 	BVH *bvh = BVH::create(bparams, scene->objects);
-	bvh->build(progress);
+	bvh->build(progress, &device->stats);
 
 	if(progress.get_cancel()) {
+#ifdef WITH_EMBREE
+		if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
+			if(dscene->data.bvh.scene) {
+				BVHEmbree::destroy(dscene->data.bvh.scene);
+			}
+		}
+#endif
 		delete bvh;
 		return;
 	}
@@ -1923,6 +1948,16 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	dscene->data.bvh.bvh_layout = bparams.bvh_layout;
 	dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
 
+
+#ifdef WITH_EMBREE
+	if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
+		dscene->data.bvh.scene = ((BVHEmbree*)bvh)->scene;
+	}
+	else {
+		dscene->data.bvh.scene = NULL;
+	}
+#endif
+
 	delete bvh;
 }
 
@@ -2266,7 +2301,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 		og->object_names.clear();
 	}
 #else
-	(void)device;
+	(void) device;
 #endif
 }
 
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 444f03a3664..7d36b2cd7ca 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -22,6 +22,7 @@
 #include "render/attribute.h"
 #include "render/shader.h"
 
+#include "util/util_array.h"
 #include "util/util_boundbox.h"
 #include "util/util_list.h"
 #include "util/util_map.h"
@@ -390,4 +391,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __MESH_H__ */
+#endif  /* __MESH_H__ */
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 28bbe2de05a..048f0fcaa24 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -20,6 +20,7 @@
 #include "render/graph.h"
 #include "graph/node.h"
 
+#include "util/util_array.h"
 #include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
@@ -1161,4 +1162,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __NODES_H__ */
+#endif  /* __NODES_H__ */
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index e3f35c366d6..dc7a1043208 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -27,7 +27,9 @@
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_progress.h"
+#include "util/util_set.h"
 #include "util/util_vector.h"
+#include "util/util_murmurhash.h"
 
 #include "subd/subd_patch_table.h"
 
@@ -483,6 +485,10 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
 	kobject.numverts = mesh->verts.size();
 	kobject.patch_map_offset = 0;
 	kobject.attribute_map_offset = 0;
+	uint32_t hash_name = util_murmur_hash3(ob->name.c_str(), ob->name.length(), 0);
+	uint32_t hash_asset = util_murmur_hash3(ob->asset_name.c_str(), ob->asset_name.length(), 0);
+	kobject.cryptomatte_object = util_hash_to_float(hash_name);
+	kobject.cryptomatte_asset = util_hash_to_float(hash_asset);
 
 	/* Object flag. */
 	if(ob->use_holdout) {
@@ -839,4 +845,64 @@ void ObjectManager::tag_update(Scene *scene)
 	scene->light_manager->need_update = true;
 }
 
+/* Build the Cryptomatte manifest of object names.
+ *
+ * Returns a JSON-style object that maps each distinct object name in
+ * scene->objects to the 8 hex digit MurmurHash3 (seed 0) of that name,
+ * in the form {"<name>":"<hash>",...}. A scene without objects gives "{}".
+ *
+ * NOTE(review): names are inserted verbatim; names containing a double
+ * quote or backslash presumably need escaping - confirm with consumers. */
+string ObjectManager::get_cryptomatte_objects(Scene *scene)
+{
+	string manifest = "{";
+
+	unordered_set<ustring, ustringHash> objects;
+	foreach(Object *object, scene->objects) {
+		/* Skip names that were already added to the manifest. */
+		if(objects.count(object->name)) {
+			continue;
+		}
+		objects.insert(object->name);
+		uint32_t hash_name = util_murmur_hash3(object->name.c_str(), object->name.length(), 0);
+		manifest += string_printf("\"%s\":\"%08x\",", object->name.c_str(), hash_name);
+	}
+	if(manifest.size() > 1) {
+		/* Turn the trailing comma of the last entry into the closing brace. */
+		manifest[manifest.size()-1] = '}';
+	}
+	else {
+		/* No entries were written, keep the opening brace intact. */
+		manifest += '}';
+	}
+	return manifest;
+}
+
+/* Build the Cryptomatte manifest of asset names.
+ *
+ * Same format as get_cryptomatte_objects(), keyed by Object::asset_name. */
+string ObjectManager::get_cryptomatte_assets(Scene *scene)
+{
+	string manifest = "{";
+	unordered_set<ustring, ustringHash> assets;
+	foreach(Object *ob, scene->objects) {
+		/* Skip asset names that were already added to the manifest. */
+		if(assets.count(ob->asset_name)) {
+			continue;
+		}
+		assets.insert(ob->asset_name);
+		uint32_t hash_asset = util_murmur_hash3(ob->asset_name.c_str(), ob->asset_name.length(), 0);
+		manifest += string_printf("\"%s\":\"%08x\",", ob->asset_name.c_str(), hash_asset);
+	}
+	if(manifest.size() > 1) {
+		/* Turn the trailing comma of the last entry into the closing brace. */
+		manifest[manifest.size()-1] = '}';
+	}
+	else {
+		/* No entries were written, keep the opening brace intact. */
+		manifest += '}';
+	}
+	return manifest;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index b80c4aef70b..87e6e6652ad 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -20,11 +20,13 @@
 #include "graph/node.h"
 #include "render/scene.h"
 
+#include "util/util_array.h"
 #include "util/util_boundbox.h"
 #include "util/util_param.h"
 #include "util/util_transform.h"
 #include "util/util_thread.h"
 #include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,6 +50,7 @@ public:
 	BoundBox bounds;
 	uint random_id;
 	int pass_id;
+	ustring asset_name;
 	vector<ParamValue> attributes;
 	uint visibility;
 	array<Transform> motion;
@@ -115,6 +118,9 @@ public:
 
 	void apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress);
 
+	string get_cryptomatte_objects(Scene *scene);
+	string get_cryptomatte_assets(Scene *scene);
+
 protected:
 	void device_update_object_transform(UpdateObjectTransformState *state,
 	                                    Object *ob,
@@ -128,4 +134,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __OBJECT_H__ */
+#endif  /* __OBJECT_H__ */
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 496e9d9491a..3fbc7d33a74 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -1255,6 +1255,6 @@ void OSLCompiler::parameter_color_array(const char * /*name*/, const array<float
 {
 }
 
-#endif /* WITH_OSL */
+#endif  /* WITH_OSL */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index 966fc1965d7..e196e0be787 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -17,6 +17,7 @@
 #ifndef __OSL_H__
 #define __OSL_H__
 
+#include "util/util_array.h"
 #include "util/util_set.h"
 #include "util/util_string.h"
 #include "util/util_thread.h"
@@ -171,4 +172,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __OSL_H__  */
+#endif  /* __OSL_H__  */
diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h
index 7e7afd5d054..27821907af0 100644
--- a/intern/cycles/render/particles.h
+++ b/intern/cycles/render/particles.h
@@ -17,8 +17,8 @@
 #ifndef __PARTICLES_H__
 #define __PARTICLES_H__
 
+#include "util/util_array.h"
 #include "util/util_types.h"
-#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -68,4 +68,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __PARTICLES_H__ */
+#endif  /* __PARTICLES_H__ */
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 9f93fed139c..ccaca8707c8 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -215,6 +215,11 @@ void Scene::device_update(Device *device_, Progress& progress)
 	object_manager->device_update(device, &dscene, this, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
+	
+	progress.set_status("Updating Hair Systems");
+	curve_system_manager->device_update(device, &dscene, this, progress);
+
+	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Particle Systems");
 	particle_system_manager->device_update(device, &dscene, this, progress);
@@ -240,12 +245,7 @@ void Scene::device_update(Device *device_, Progress& progress)
 	camera->device_update_volume(device, &dscene, this);
 
 	if(progress.get_cancel() || device->have_error()) return;
-
-	progress.set_status("Updating Hair Systems");
-	curve_system_manager->device_update(device, &dscene, this, progress);
-
-	if(progress.get_cancel() || device->have_error()) return;
-
+	
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index dd8069537eb..57ea1d471e8 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -166,7 +166,6 @@ public:
 	bool use_bvh_spatial_split;
 	bool use_bvh_unaligned_nodes;
 	int num_bvh_time_steps;
-
 	bool persistent_data;
 	int texture_limit;
 
@@ -269,4 +268,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /*  __SCENE_H__ */
+#endif  /*  __SCENE_H__ */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index d0aa985b035..d6ecafa19b7 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -682,7 +682,10 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	BakeManager *bake_manager = scene->bake_manager;
 	requested_features.use_baking = bake_manager->get_baking();
 	requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
-	requested_features.use_denoising = params.use_denoising;
+	if(params.denoising_passes) {
+		requested_features.use_denoising = true;
+		requested_features.use_shadow_tricks = true;
+	}
 
 	return requested_features;
 }
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 61f62f8e712..56a69919a7a 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -58,6 +58,7 @@ public:
 	bool display_buffer_linear;
 
 	bool use_denoising;
+	bool denoising_passes;
 	int denoising_radius;
 	float denoising_strength;
 	float denoising_feature_strength;
@@ -89,6 +90,7 @@ public:
 		threads = 0;
 
 		use_denoising = false;
+		denoising_passes = false;
 		denoising_radius = 8;
 		denoising_strength = 0.0f;
 		denoising_feature_strength = 0.0f;
@@ -236,4 +238,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __SESSION_H__ */
+#endif  /* __SESSION_H__ */
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index ac605305b94..d6c2d7502f2 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -30,6 +30,7 @@
 #include "render/tables.h"
 
 #include "util/util_foreach.h"
+#include "util/util_murmurhash.h"
 
 #ifdef WITH_OCIO
 #  include <OpenColorIO/OpenColorIO.h>
@@ -387,7 +388,7 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
 {
 	ShaderManager *manager;
 
-	(void)shadingsystem;  /* Ignored when built without OSL. */
+	(void) shadingsystem;  /* Ignored when built without OSL. */
 
 #ifdef WITH_OSL
 	if(shadingsystem == SHADINGSYSTEM_OSL) {
@@ -523,12 +524,15 @@ void ShaderManager::device_update_common(Device *device,
 		if(shader->is_constant_emission(&constant_emission))
 			flag |= SD_HAS_CONSTANT_EMISSION;
 
+		uint32_t cryptomatte_id = util_murmur_hash3(shader->name.c_str(), shader->name.length(), 0);
+		
 		/* regular shader */
 		kshader->flags = flag;
 		kshader->pass_id = shader->pass_id;
 		kshader->constant_emission[0] = constant_emission.x;
 		kshader->constant_emission[1] = constant_emission.y;
 		kshader->constant_emission[2] = constant_emission.z;
+		kshader->cryptomatte_id = util_hash_to_float(cryptomatte_id);
 		kshader++;
 
 		has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
@@ -695,4 +699,36 @@ float ShaderManager::linear_rgb_to_gray(float3 c)
 	return dot(c, rgb_to_y);
 }
 
+/* Build the Cryptomatte manifest of material names.
+ *
+ * Returns a JSON-style object that maps each distinct shader name in
+ * scene->shaders to the 8 hex digit MurmurHash3 (seed 0) of that name,
+ * in the form {"<name>":"<hash>",...}. A scene without shaders gives "{}".
+ *
+ * NOTE(review): names are inserted verbatim; names containing a double
+ * quote or backslash presumably need escaping - confirm with consumers. */
+string ShaderManager::get_cryptomatte_materials(Scene *scene)
+{
+	string manifest = "{";
+	unordered_set<ustring, ustringHash> materials;
+	foreach(Shader *shader, scene->shaders) {
+		/* Skip names that were already added to the manifest. */
+		if(materials.count(shader->name)) {
+			continue;
+		}
+		materials.insert(shader->name);
+		uint32_t cryptomatte_id = util_murmur_hash3(shader->name.c_str(), shader->name.length(), 0);
+		manifest += string_printf("\"%s\":\"%08x\",", shader->name.c_str(), cryptomatte_id);
+	}
+	if(manifest.size() > 1) {
+		/* Turn the trailing comma of the last entry into the closing brace. */
+		manifest[manifest.size()-1] = '}';
+	}
+	else {
+		/* No entries were written, keep the opening brace intact. */
+		manifest += '}';
+	}
+	return manifest;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 58314a1e310..4c7b2fd433b 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -197,6 +197,8 @@ public:
 
 	float linear_rgb_to_gray(float3 c);
 
+	string get_cryptomatte_materials(Scene *scene);
+
 protected:
 	ShaderManager();
 
@@ -222,4 +224,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __SHADER_H__ */
+#endif  /* __SHADER_H__ */
diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h
index d38857d2b35..ce7a28587f2 100644
--- a/intern/cycles/render/sobol.h
+++ b/intern/cycles/render/sobol.h
@@ -28,4 +28,4 @@ void sobol_generate_direction_vectors(uint vectors[][SOBOL_BITS], int dimensions
 
 CCL_NAMESPACE_END
 
-#endif /* __SOBOL_H__ */
+#endif  /* __SOBOL_H__ */
diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h
index 72d5f1dd93d..2ff0ec3e0e9 100644
--- a/intern/cycles/render/stats.h
+++ b/intern/cycles/render/stats.h
@@ -101,4 +101,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __RENDER_STATS_H__ */
+#endif  /* __RENDER_STATS_H__ */
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 44b7eeec6db..b380117e729 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -266,7 +266,7 @@ int SVMCompiler::stack_assign(ShaderInput *input)
 				add_node(NODE_VALUE_V, input->stack_offset);
 				add_node(NODE_VALUE_V, node->get_float3(input->socket_type));
 			}
-			else /* should not get called for closure */
+			else  /* should not get called for closure */
 				assert(0);
 		}
 	}
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index 1d0613bbfdc..af97a490a87 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -21,6 +21,7 @@
 #include "render/graph.h"
 #include "render/shader.h"
 
+#include "util/util_array.h"
 #include "util/util_set.h"
 #include "util/util_string.h"
 #include "util/util_thread.h"
@@ -223,4 +224,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_H__ */
+#endif  /* __SVM_H__ */
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 09d961a9c3c..709333cb1b6 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -50,4 +50,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __TABLES_H__ */
+#endif  /* __TABLES_H__ */
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index f72f653f4c2..2835c793073 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -154,4 +154,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __TILE_H__ */
+#endif  /* __TILE_H__ */
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 4617c782b3a..2bef8d4cf8d 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -134,4 +134,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __SUBD_DICE_H__ */
+#endif  /* __SUBD_DICE_H__ */
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 64ec8f70951..84100139f2c 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -56,4 +56,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __SUBD_PATCH_H__ */
+#endif  /* __SUBD_PATCH_H__ */
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
index 13a6f284542..0e9d3f37af4 100644
--- a/intern/cycles/subd/subd_patch_table.cpp
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -252,8 +252,8 @@ void PackedPatchTable::pack(Far::PatchTable* patch_table, int offset)
 
 	build_patch_map(*this, patch_table, offset);
 #else
-	(void)patch_table;
-	(void)offset;
+	(void) patch_table;
+	(void) offset;
 #endif
 }
 
diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h
index 45be7051992..1765578c42e 100644
--- a/intern/cycles/subd/subd_patch_table.h
+++ b/intern/cycles/subd/subd_patch_table.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_TABLE_H__
 #define __SUBD_PATCH_TABLE_H__
 
+#include "util/util_array.h"
 #include "util/util_types.h"
-#include "util/util_vector.h"
 
 #ifdef WITH_OPENSUBDIV
 #ifdef _MSC_VER
@@ -59,4 +59,4 @@ struct PackedPatchTable {
 
 CCL_NAMESPACE_END
 
-#endif /* __SUBD_PATCH_TABLE_H__ */
+#endif  /* __SUBD_PATCH_TABLE_H__ */
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index 7a276b35382..3368c93944b 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -56,4 +56,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __SUBD_SPLIT_H__ */
+#endif  /* __SUBD_SPLIT_H__ */
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index b66a91adbda..cfdab7a6433 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -20,6 +20,7 @@
 #include "render/graph.h"
 #include "render/scene.h"
 #include "render/nodes.h"
+#include "util/util_array.h"
 #include "util/util_logging.h"
 #include "util/util_string.h"
 #include "util/util_vector.h"
diff --git a/intern/cycles/test/util_path_test.cpp b/intern/cycles/test/util_path_test.cpp
index c2f400c105d..1df568493d8 100644
--- a/intern/cycles/test/util_path_test.cpp
+++ b/intern/cycles/test/util_path_test.cpp
@@ -370,7 +370,7 @@ TEST(util_path_is_relative, relative_windir_on_unix)
 	bool is_relative = path_is_relative("tmp\\foo.txt");
 	EXPECT_TRUE(is_relative);
 }
-#endif /* !_WIN32 */
+#endif  /* !_WIN32 */
 
 #ifdef _WIN32
 TEST(util_path_is_relative, absolute_windows)
@@ -396,6 +396,6 @@ TEST(util_path_is_relative, relative_unixdir_on_windows)
 	bool is_relative = path_is_relative("tmp/foo.txt");
 	EXPECT_TRUE(is_relative);
 }
-#endif /* _WIN32 */
+#endif  /* _WIN32 */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 291f9a9fcae..77d47984ee7 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
 	util_logging.cpp
 	util_math_cdf.cpp
 	util_md5.cpp
+	util_murmurhash.cpp
 	util_path.cpp
 	util_string.cpp
 	util_simd.cpp
@@ -36,6 +37,7 @@ set(SRC_HEADERS
 	util_algorithm.h
 	util_aligned_malloc.h
 	util_args.h
+	util_array.h
 	util_atomic.h
 	util_boundbox.h
 	util_debug.h
@@ -64,6 +66,7 @@ set(SRC_HEADERS
 	util_math_int4.h
 	util_math_matrix.h
 	util_md5.h
+	util_murmurhash.h
 	util_opengl.h
 	util_optimization.h
 	util_param.h
diff --git a/intern/cycles/util/util_algorithm.h b/intern/cycles/util/util_algorithm.h
index eb874713d43..f9e6476cc52 100644
--- a/intern/cycles/util/util_algorithm.h
+++ b/intern/cycles/util/util_algorithm.h
@@ -29,4 +29,4 @@ using std::remove;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_ALGORITHM_H__ */
+#endif  /* __UTIL_ALGORITHM_H__ */
diff --git a/intern/cycles/util/util_args.h b/intern/cycles/util/util_args.h
index be6f2c2b9f1..9fe54b14d77 100644
--- a/intern/cycles/util/util_args.h
+++ b/intern/cycles/util/util_args.h
@@ -28,4 +28,4 @@ OIIO_NAMESPACE_USING
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_ARGS_H__ */
+#endif  /* __UTIL_ARGS_H__ */
diff --git a/intern/cycles/util/util_array.h b/intern/cycles/util/util_array.h
new file mode 100644
index 00000000000..5f18d434c31
--- /dev/null
+++ b/intern/cycles/util/util_array.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_ARRAY_H__
+#define __UTIL_ARRAY_H__
+
+#include <cassert>
+#include <cstring>
+#include <new>
+
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Simplified version of vector, serving multiple purposes:
+ * - somewhat faster in that it does not clear memory on resize/alloc,
+ *   this was actually showing up in profiles quite significantly. it
+ *   also does not run any constructors/destructors
+ * - if this is used, we are not tempted to use inefficient operations
+ * - aligned allocation for CPU native data types
+ *
+ * Elements are moved around with memcpy() and compared with memcmp(), so
+ * T must be a type for which raw byte copies are valid. */
+
+template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES>
+class array
+{
+public:
+	/* Construct an empty array, nothing is allocated. */
+	array()
+	: data_(NULL),
+	  datasize_(0),
+	  capacity_(0)
+	{}
+
+	/* Construct an array of newsize elements with uninitialized contents. */
+	explicit array(size_t newsize)
+	{
+		if(newsize == 0) {
+			data_ = NULL;
+			datasize_ = 0;
+			capacity_ = 0;
+		}
+		else {
+			data_ = mem_allocate(newsize);
+			datasize_ = newsize;
+			capacity_ = datasize_;
+		}
+	}
+
+	/* Copy construct, capacity of the copy equals the size of the source. */
+	array(const array& from)
+	{
+		if(from.datasize_ == 0) {
+			data_ = NULL;
+			datasize_ = 0;
+			capacity_ = 0;
+		}
+		else {
+			data_ = mem_allocate(from.datasize_);
+			memcpy((void*)data_, from.data_, from.datasize_*sizeof(T));
+			datasize_ = from.datasize_;
+			capacity_ = datasize_;
+		}
+	}
+
+	/* Copy the elements of another array, self assignment is a no-op. */
+	array& operator=(const array& from)
+	{
+		if(this != &from) {
+			resize(from.size());
+			/* Skip memcpy() for empty arrays, data_ is NULL then. */
+			if(datasize_ > 0) {
+				memcpy((void*)data_, from.data_, datasize_*sizeof(T));
+			}
+		}
+
+		return *this;
+	}
+
+	/* Copy the elements of a vector. */
+	array& operator=(const vector<T>& from)
+	{
+		resize(from.size());
+
+		if(from.size() > 0) {
+			memcpy((void*)data_, &from[0], datasize_*sizeof(T));
+		}
+
+		return *this;
+	}
+
+	~array()
+	{
+		mem_free(data_, capacity_);
+	}
+
+	/* Byte-wise comparison of the stored elements, capacity is ignored. */
+	bool operator==(const array& other) const
+	{
+		if(datasize_ != other.datasize_) {
+			return false;
+		}
+		/* Two empty arrays are equal, avoids memcmp() on NULL pointers. */
+		if(datasize_ == 0) {
+			return true;
+		}
+
+		return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0;
+	}
+
+	bool operator!=(const array& other) const
+	{
+		return !(*this == other);
+	}
+
+	/* Take over the allocation of from, which is left empty. */
+	void steal_data(array& from)
+	{
+		if(this != &from) {
+			clear();
+
+			data_ = from.data_;
+			datasize_ = from.datasize_;
+			capacity_ = from.capacity_;
+
+			from.data_ = NULL;
+			from.datasize_ = 0;
+			from.capacity_ = 0;
+		}
+	}
+
+	/* Release the allocation to the caller and leave the array empty.
+	 * The memory is not freed here and mem_free() is not called for it, so
+	 * the guarded allocator statistics are not updated by this array. */
+	T *steal_pointer()
+	{
+		T *ptr = data_;
+		data_ = NULL;
+		clear();
+		return ptr;
+	}
+
+	/* Change the number of elements. Existing elements are kept, new ones
+	 * are uninitialized. Memory is only reallocated when newsize exceeds
+	 * the capacity; resizing to 0 frees the allocation. */
+	T* resize(size_t newsize)
+	{
+		if(newsize == 0) {
+			clear();
+		}
+		else if(newsize != datasize_) {
+			if(newsize > capacity_) {
+				T *newdata = mem_allocate(newsize);
+				if(newdata == NULL) {
+					/* Allocation failed, likely out of memory. */
+					clear();
+					return NULL;
+				}
+				else if(data_ != NULL) {
+					memcpy((void *)newdata,
+					       data_,
+					       ((datasize_ < newsize)? datasize_: newsize)*sizeof(T));
+					mem_free(data_, capacity_);
+				}
+				data_ = newdata;
+				capacity_ = newsize;
+			}
+			datasize_ = newsize;
+		}
+		return data_;
+	}
+
+	/* Resize and fill the newly added elements (if any) with value. */
+	T* resize(size_t newsize, const T& value)
+	{
+		size_t oldsize = size();
+		resize(newsize);
+
+		for(size_t i = oldsize; i < size(); i++) {
+			data_[i] = value;
+		}
+
+		return data_;
+	}
+
+	/* Free the allocation and reset size and capacity to zero. */
+	void clear()
+	{
+		if(data_ != NULL) {
+			mem_free(data_, capacity_);
+			data_ = NULL;
+		}
+		datasize_ = 0;
+		capacity_ = 0;
+	}
+
+	bool empty() const
+	{
+		return datasize_ == 0;
+	}
+
+	size_t size() const
+	{
+		return datasize_;
+	}
+
+	T* data()
+	{
+		return data_;
+	}
+
+	const T* data() const
+	{
+		return data_;
+	}
+
+	/* Element access, the index is only checked by assert(). */
+	T& operator[](size_t i) const
+	{
+		assert(i < datasize_);
+		return data_[i];
+	}
+
+	/* Ensure room for at least newcapacity elements, size is unchanged. */
+	void reserve(size_t newcapacity)
+	{
+		if(newcapacity > capacity_) {
+			T *newdata = mem_allocate(newcapacity);
+			if(data_ != NULL) {
+				memcpy((void*)newdata, data_, ((datasize_ < newcapacity)? datasize_: newcapacity)*sizeof(T));
+				mem_free(data_, capacity_);
+			}
+			data_ = newdata;
+			capacity_ = newcapacity;
+		}
+	}
+
+	size_t capacity() const
+	{
+		return capacity_;
+	}
+
+	// do not use this method unless you are sure the code is not performance critical
+	void push_back_slow(const T& t)
+	{
+		if(capacity_ == datasize_)
+		{
+			reserve(datasize_ == 0 ? 1 : (size_t)((datasize_ + 1) * 1.2));
+		}
+
+		data_[datasize_++] = t;
+	}
+
+	/* Append to storage that was reserved beforehand; asserts that no
+	 * reallocation is needed. */
+	void push_back_reserved(const T& t)
+	{
+		assert(datasize_ < capacity_);
+		push_back_slow(t);
+	}
+
+	/* Append all elements of from; from may be this array itself. */
+	void append(const array& from)
+	{
+		/* Read the size up front, resize() changes it when from is *this. */
+		const size_t from_size = from.size();
+		if(from_size) {
+			size_t old_size = size();
+			resize(old_size + from_size);
+			memcpy((void*)(data_ + old_size), from.data(), sizeof(T) * from_size);
+		}
+	}
+
+protected:
+	/* Allocate aligned, uninitialized storage for N elements and register
+	 * it with the guarded allocator. Returns NULL for N == 0 and throws
+	 * std::bad_alloc when the allocation fails. */
+	inline T* mem_allocate(size_t N)
+	{
+		if(N == 0) {
+			return NULL;
+		}
+		T *mem = (T*)util_aligned_malloc(sizeof(T)*N, alignment);
+		if(mem != NULL) {
+			util_guarded_mem_alloc(sizeof(T)*N);
+		}
+		else {
+			throw std::bad_alloc();
+		}
+		return mem;
+	}
+
+	/* Free storage of N elements obtained from mem_allocate(). */
+	inline void mem_free(T *mem, size_t N)
+	{
+		if(mem != NULL) {
+			util_guarded_mem_free(sizeof(T)*N);
+			util_aligned_free(mem);
+		}
+	}
+
+	T *data_;
+	size_t datasize_;
+	size_t capacity_;
+};
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_ARRAY_H__ */
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index f3c7ae546a0..477b667a6fe 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -23,12 +23,13 @@
 #include "atomic_ops.h"
 
 #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
+#define atomic_compare_and_swap_float(p, old_val, new_val) atomic_cas_float((p), (old_val), (new_val))
 
 #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
 #define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)
 
 #define CCL_LOCAL_MEM_FENCE 0
-#define ccl_barrier(flags) (void)0
+#define ccl_barrier(flags) ((void) 0)
 
 #else  /* __KERNEL_GPU__ */
 
@@ -57,6 +58,20 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
 	return new_value.float_value;
 }
 
+ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,  /* Float compare-and-swap built on the integer atomic_cmpxchg(). */
+                                                      const float old_val, const float new_val)
+{
+	union {  /* Gives access to the same 32 bits as unsigned int or as float. */
+		unsigned int int_value;
+		float float_value;
+	} new_value, prev_value, result;
+	prev_value.float_value = old_val;
+	new_value.float_value = new_val;
+	result.int_value = atomic_cmpxchg((volatile ccl_global unsigned int *)dest,  /* dest is accessed through an unsigned int pointer. */
+                                       prev_value.int_value, new_value.int_value);
+	return result.float_value;  /* Value returned by atomic_cmpxchg() as float; presumably the previous content of dest - confirm. */
+}
+
 #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
 #define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
 #define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
@@ -75,6 +90,19 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
 #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
 #define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
 
+ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,  /* Float compare-and-swap built on the integer atomicCAS(). */
+                                                      const float old_val, const float new_val)
+{
+	union {  /* Gives access to the same 32 bits as unsigned int or as float. */
+		unsigned int int_value;
+		float float_value;
+	} new_value, prev_value, result;
+	prev_value.float_value = old_val;
+	new_value.float_value = new_val;
+	result.int_value = atomicCAS((unsigned int *)dest, prev_value.int_value,new_value.int_value);  /* The cast also drops the volatile qualifier of dest. */
+	return result.float_value;  /* Value returned by atomicCAS() as float; presumably the previous content of dest - confirm. */
+}
+
 #define CCL_LOCAL_MEM_FENCE
 #define ccl_barrier(flags) __syncthreads()
 
@@ -82,4 +110,4 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
 
 #endif  /* __KERNEL_GPU__ */
 
-#endif /* __UTIL_ATOMIC_H__ */
+#endif  /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 60d9bb44256..25ef39d39ae 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -44,23 +44,12 @@ struct avxb
 	__forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
 	__forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
 
-	//__forceinline avxb           ( bool  a )
-	//	: m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
-	//__forceinline avxb           ( bool  a, bool  b)
-	//	: m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
-	//__forceinline avxb           ( bool  a, bool  b, bool  c, bool  d)
-	//	: m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
-	//__forceinline avxb(int mask) {
-	//	assert(mask >= 0 && mask < 16);
-	//	m128 = _mm_lookupmask_ps[mask];
-	//}
-
 	////////////////////////////////////////////////////////////////////////////////
 	/// Constants
 	////////////////////////////////////////////////////////////////////////////////
 
 	__forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
-	__forceinline avxb( TrueTy  ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
+	__forceinline avxb( TrueTy  ) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1))) {}
 
 	////////////////////////////////////////////////////////////////////////////////
 	/// Array Access
@@ -97,7 +86,21 @@ __forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^
 ////////////////////////////////////////////////////////////////////////////////
 
 __forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
-__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b )  /* Equality of two masks, compared as 32-bit integer lanes. */
+{
+#ifdef __KERNEL_AVX2__
+	return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));  /* Single 256-bit integer compare. */
+#else  /* Without __KERNEL_AVX2__: compare the two 128-bit halves separately. */
+	__m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
+	__m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
+	__m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
+	__m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
+	__m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
+	__m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
+	__m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);  /* Recombine: c_lo in the low half, c_hi in the high half. */
+	return _mm256_castsi256_ps(result);
+#endif
+}
 
 __forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
 #if defined(__KERNEL_SSE41__)
@@ -114,47 +117,6 @@ __forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
 __forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
 __forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
 
-#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
-                                                      ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
-__forceinline const avxb shuffle( const avxb& a ) {
-	return _mm256_cvtepi32_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
-}
-
-/*
-template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
-	return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
-	return _mm_movehl_ps(a, a);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
-	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-}
-
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
-	return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
-	return _mm_movehl_ps(b, a);
-}
-
-#if defined(__KERNEL_SSE3__)
-template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
-template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
-#endif
-
-#if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
-template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
-template<size_t dst>             __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
-#endif
-*/
-
 ////////////////////////////////////////////////////////////////////////////////
 /// Reduction Operations
 ////////////////////////////////////////////////////////////////////////////////
@@ -180,7 +142,7 @@ __forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
 
 ccl_device_inline void print_avxb(const char *label, const avxb &a)
 {
-	printf("%s: %df %df %df %df %df %df %df %d\n",
+	printf("%s: %d %d %d %d %d %d %d %d\n",
 	       label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
 }
 
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 5596702ca20..f00c722f25b 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -40,8 +40,8 @@ struct avxf
 	__forceinline avxf(const __m256 a) : m256(a) {}
 	__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}
 
-	__forceinline operator const __m256&(void) const { return m256; }
-	__forceinline operator       __m256&(void)       { return m256; }
+	__forceinline operator const __m256&() const { return m256; }
+	__forceinline operator       __m256&()       { return m256; }
 
 	__forceinline avxf          (float a) : m256(_mm256_set1_ps(a)) {}
 
@@ -214,17 +214,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
 #endif
 }
 __forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
 	return _mm256_fmsub_ps(a, b, c);
+#else
+	return (a*b) - c;
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Comparison Operators
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef __KERNEL_AVX2__
 __forceinline const avxb operator <=(const avxf& a, const avxf& b) {
 	return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
 }
-#endif
 
 #endif
 
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index b1bd5be0df3..fe89e398840 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -282,4 +282,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_BOUNDBOX_H__ */
+#endif  /* __UTIL_BOUNDBOX_H__ */
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 826db469d25..e6efc7d73fc 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -240,4 +240,4 @@ ccl_device float4 color_srgb_to_linear_v4(float4 c)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_COLOR_H__ */
+#endif  /* __UTIL_COLOR_H__ */
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index f17f8a560ee..864089bb118 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -182,4 +182,4 @@ std::ostream& operator <<(std::ostream &os,
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_DEBUG_H__ */
+#endif  /* __UTIL_DEBUG_H__ */
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 8bce4aca699..429cfe647ef 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -72,7 +72,7 @@
 #  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
 #    define ATTR_FALLTHROUGH __attribute__((fallthrough))
 #  else
-#    define ATTR_FALLTHROUGH ((void)0)
+#    define ATTR_FALLTHROUGH ((void) 0)
 #  endif
 #endif  /* __KERNEL_GPU__ */
 
@@ -104,14 +104,14 @@ template<typename T> static inline T decltype_helper(T x) { return x; }
 #define CHECK_TYPE(var, type)  {  \
 	TYPEOF(var) *__tmp;           \
 	__tmp = (type *)NULL;         \
-	(void)__tmp;                  \
-} (void)0
+	(void) __tmp;                 \
+} (void) 0
 
 #define CHECK_TYPE_PAIR(var_a, var_b)  {  \
 	TYPEOF(var_a) *__tmp;                 \
 	__tmp = (typeof(var_b) *)NULL;        \
-	(void)__tmp;                          \
-} (void)0
+	(void) __tmp;                          \
+} (void) 0
 #else
 #  define CHECK_TYPE(var, type)
 #  define CHECK_TYPE_PAIR(var_a, var_b)
@@ -128,4 +128,4 @@ template<typename T> static inline T decltype_helper(T x) { return x; }
 #  define util_assert(statement)
 #endif
 
-#endif /* __UTIL_DEFINES_H__ */
+#endif  /* __UTIL_DEFINES_H__ */
diff --git a/intern/cycles/util/util_foreach.h b/intern/cycles/util/util_foreach.h
index 2a74ff0a55d..fd106d58b43 100644
--- a/intern/cycles/util/util_foreach.h
+++ b/intern/cycles/util/util_foreach.h
@@ -21,4 +21,4 @@
 
 #define foreach(x, y) for(x : y)
 
-#endif /* __UTIL_FOREACH_H__ */
+#endif  /* __UTIL_FOREACH_H__ */
diff --git a/intern/cycles/util/util_function.h b/intern/cycles/util/util_function.h
index f3cc00329ad..72c7ce43073 100644
--- a/intern/cycles/util/util_function.h
+++ b/intern/cycles/util/util_function.h
@@ -36,4 +36,4 @@ using std::placeholders::_9;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_FUNCTION_H__ */
+#endif  /* __UTIL_FUNCTION_H__ */
diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp
index 54fa6a80df5..ae1d217c54f 100644
--- a/intern/cycles/util/util_guarded_allocator.cpp
+++ b/intern/cycles/util/util_guarded_allocator.cpp
@@ -35,12 +35,12 @@ void util_guarded_mem_free(size_t n)
 
 /* Public API. */
 
-size_t util_guarded_get_mem_used(void)
+size_t util_guarded_get_mem_used()
 {
 	return global_stats.mem_used;
 }
 
-size_t util_guarded_get_mem_peak(void)
+size_t util_guarded_get_mem_peak()
 {
 	return global_stats.mem_peak;
 }
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 87c1526dee4..2c6f1790fd0 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -47,7 +47,7 @@ public:
 
 	T *allocate(size_t n, const void *hint = 0)
 	{
-		(void)hint;
+		(void) hint;
 		size_t size = n * sizeof(T);
 		util_guarded_mem_alloc(size);
 		if(n == 0) {
@@ -158,8 +158,8 @@ public:
 };
 
 /* Get memory usage and peak from the guarded STL allocator. */
-size_t util_guarded_get_mem_used(void);
-size_t util_guarded_get_mem_peak(void);
+size_t util_guarded_get_mem_used();
+size_t util_guarded_get_mem_peak();
 
 /* Call given function and keep track if it runs out of memory.
  *
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 53b7f2472bd..3868509c21b 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -148,4 +148,4 @@ ccl_device_inline half float_to_half(float f)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_HALF_H__ */
+#endif  /* __UTIL_HALF_H__ */
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index a8a5076fbb3..f343252eaca 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -68,4 +68,4 @@ ccl_device_inline float hash_int_01(uint k)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_HASH_H__ */
+#endif  /* __UTIL_HASH_H__ */
diff --git a/intern/cycles/util/util_ies.cpp b/intern/cycles/util/util_ies.cpp
index e068957325b..e1de2e0c6e4 100644
--- a/intern/cycles/util/util_ies.cpp
+++ b/intern/cycles/util/util_ies.cpp
@@ -21,6 +21,13 @@
 
 CCL_NAMESPACE_BEGIN
 
+// NOTE: For some reason gcc-7.2 does not instantiate this version of allocator
+// here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
+//
+// TODO(sergey): Get to the root of this issue, or confirm this is a compiler
+// issue.
+template class GuardedAllocator<char>;
+
 bool IESFile::load(ustring ies)
 {
 	clear();
diff --git a/intern/cycles/util/util_ies.h b/intern/cycles/util/util_ies.h
index 5933cb3962a..663ad649a9c 100644
--- a/intern/cycles/util/util_ies.h
+++ b/intern/cycles/util/util_ies.h
@@ -58,4 +58,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_IES_H__ */
+#endif  /* __UTIL_IES_H__ */
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index 85bdb0d8050..da5f56271c8 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -102,6 +102,6 @@ inline half util_image_cast_from_float(float value)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_IMAGE_H__ */
+#endif  /* __UTIL_IMAGE_H__ */
 
 #include "util/util_image_impl.h"
diff --git a/intern/cycles/util/util_list.h b/intern/cycles/util/util_list.h
index f555b001186..fcf8e4f5c74 100644
--- a/intern/cycles/util/util_list.h
+++ b/intern/cycles/util/util_list.h
@@ -25,4 +25,4 @@ using std::list;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_LIST_H__ */
+#endif  /* __UTIL_LIST_H__ */
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index f38683bf7de..b0922db32fb 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -45,7 +45,7 @@ void util_logging_init(const char *argv0)
 #endif
 }
 
-void util_logging_start(void)
+void util_logging_start()
 {
 #ifdef WITH_CYCLES_LOGGING
 	using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 5c84b6593d3..f66d7c92dcc 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -41,7 +41,7 @@ public:
 	void operator&(StubStream&) { }
 };
 
-#  define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
+#  define LOG_SUPPRESS() (true) ? ((void) 0) : LogMessageVoidify() & StubStream()
 #  define LOG(severity) LOG_SUPPRESS()
 #  define VLOG(severity) LOG_SUPPRESS()
 #endif
@@ -52,7 +52,7 @@ struct int2;
 struct float3;
 
 void util_logging_init(const char *argv0);
-void util_logging_start(void);
+void util_logging_start();
 void util_logging_verbosity_set(int verbosity);
 
 std::ostream& operator <<(std::ostream &os,
@@ -62,4 +62,4 @@ std::ostream& operator <<(std::ostream &os,
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_LOGGING_H__ */
+#endif  /* __UTIL_LOGGING_H__ */
diff --git a/intern/cycles/util/util_map.h b/intern/cycles/util/util_map.h
index 3c9288417cf..1952d33ada8 100644
--- a/intern/cycles/util/util_map.h
+++ b/intern/cycles/util/util_map.h
@@ -28,4 +28,4 @@ using std::unordered_map;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MAP_H__ */
+#endif  /* __UTIL_MAP_H__ */
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52aeb8d8599..6167119f873 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -157,7 +157,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d)
 {
 	return max(max(a,b),max(c,d));
 }
-#endif /* __KERNEL_GPU__ */
+#endif  /* __KERNEL_GPU__ */
 
 ccl_device_inline float min4(float a, float b, float c, float d)
 {
@@ -220,7 +220,31 @@ ccl_device_inline float __uint_as_float(uint i)
 	u.i = i;
 	return u.f;
 }
-#endif /* __KERNEL_OPENCL__ */
+
+ccl_device_inline int4 __float4_as_int4(float4 f)  /* Reinterpret the bits of f as an int4; no numeric conversion. */
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_castps_si128(f.m128));  /* Same 128-bit register, viewed as integers. */
+#else
+	return make_int4(__float_as_int(f.x),  /* Component-wise via __float_as_int. */
+	                 __float_as_int(f.y),
+	                 __float_as_int(f.z),
+	                 __float_as_int(f.w));
+#endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_castsi128_ps(i.m128));
+#else
+	return make_float4(__int_as_float(i.x),
+	                   __int_as_float(i.y),
+	                   __int_as_float(i.z),
+	                   __int_as_float(i.w));
+#endif
+}
+#endif  /* __KERNEL_OPENCL__ */
 
 /* Versions of functions which are safe for fast math. */
 ccl_device_inline bool isnan_safe(float f)
@@ -615,4 +639,4 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_H__ */
+#endif  /* __UTIL_MATH_H__ */
diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h
index 79643fe26e3..983855e3e9b 100644
--- a/intern/cycles/util/util_math_cdf.h
+++ b/intern/cycles/util/util_math_cdf.h
@@ -75,4 +75,4 @@ void util_cdf_inverted(const int resolution,
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_H_CDF__ */
+#endif  /* __UTIL_MATH_H_CDF__ */
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index d3960deb3b4..323d40058e5 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
 	return a * b + c;
 }
 
+ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
+{
+	return a * b + c;
+}
+
 /*
  * FAST & APPROXIMATE MATH
  *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
 	return fast_exp2f(x / M_LN2_F);
 }
 
+#ifndef __KERNEL_GPU__
+ccl_device float4 fast_exp2f4(float4 x)  /* Approximate 2^x for each of the four components. */
+{
+	const float4 one = make_float4(1.0f);
+	const float4 limit = make_float4(126.0f);
+	x = clamp(x, -limit, limit);  /* Restrict the input to [-126, 126] before splitting it. */
+	int4 m = make_int4(x);  /* Integer part of x (assumes make_int4 converts like a float-to-int cast -- TODO confirm). */
+	x = one - (one - (x - make_float4(m)));  /* Remainder x - m, passed through 1 - (1 - v); NOTE(review): presumably to flush tiny values, confirm. */
+	float4 r = make_float4(1.33336498402e-3f);  /* Polynomial in the remainder, evaluated with Horner's scheme. */
+	r = madd4(x, r, make_float4(9.810352697968e-3f));
+	r = madd4(x, r, make_float4(5.551834031939e-2f));
+	r = madd4(x, r, make_float4(0.2401793301105f));
+	r = madd4(x, r, make_float4(0.693144857883f));
+	r = madd4(x, r, make_float4(1.0f));
+	return __int4_as_float4(__float4_as_int4(r) + (m << 23));  /* Add m at bit 23 of r's bit pattern, i.e. to the float exponent field, scaling r by 2^m. */
+}
+
+ccl_device_inline float4 fast_expf4(float4 x)
+{
+	return fast_exp2f4(x / M_LN2_F);
+}
+#endif
+
 ccl_device_inline float fast_exp10(float x)
 {
 	/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 6f9d0855d50..e937509367f 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -224,4 +224,4 @@ ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_FLOAT2_H__ */
+#endif  /* __UTIL_MATH_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 75265c1c9a2..a54a3f3087c 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -419,4 +419,4 @@ ccl_device_inline float3 ensure_finite3(float3 v)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_FLOAT3_H__ */
+#endif  /* __UTIL_MATH_FLOAT3_H__ */
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index aa7e56fefe9..479ccf202ba 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
 ccl_device_inline float4 operator-(const float4& a, const float4& b);
 ccl_device_inline float4 operator+=(float4& a, const float4& b);
 ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, float f);
 ccl_device_inline float4 operator/=(float4& a, float f);
 
 ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
 ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
 	return a = a * b;
 }
 
+ccl_device_inline float4 operator*=(float4& a, float f)
+{
+	return a = a * f;
+}
+
 ccl_device_inline float4 operator/=(float4& a, float f)
 {
 	return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 #endif
 }
 
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
+{
+	return min(max(a, mn), mx);
+}
+
 ccl_device_inline float4 fabs(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -445,4 +457,4 @@ ccl_device_inline float4 load_float4(const float *v)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_FLOAT4_H__ */
+#endif  /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
index 828c49a131c..dd401d9a091 100644
--- a/intern/cycles/util/util_math_int2.h
+++ b/intern/cycles/util/util_math_int2.h
@@ -74,4 +74,4 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_INT2_H__ */
+#endif  /* __UTIL_MATH_INT2_H__ */
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
index 81b10f31f4a..2f4752f90f1 100644
--- a/intern/cycles/util/util_math_int3.h
+++ b/intern/cycles/util/util_math_int3.h
@@ -113,4 +113,4 @@ ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_INT3_H__ */
+#endif  /* __UTIL_MATH_INT3_H__ */
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 79a8c0841e7..763c42318d5 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline int4 operator+(const int4& a, const int4& b);
 ccl_device_inline int4 operator+=(int4& a, const int4& b);
 ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 operator<<(const int4& a, int i);
+ccl_device_inline int4 operator<(const int4& a, const int4& b);
+ccl_device_inline int4 operator>=(const int4& a, const int4& b);
+ccl_device_inline int4 operator&(const int4& a, const int4& b);
 ccl_device_inline int4 min(int4 a, int4 b);
 ccl_device_inline int4 max(int4 a, int4 b);
 ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
 #endif
 }
 
+ccl_device_inline int4 operator<<(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_slli_epi32(a.m128, i));
+#else
+	return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
+#endif
+}
+
+ccl_device_inline int4 operator<(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_cmplt_epi32(a.m128, b.m128));
+#else
+	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const int4& a, const int4& b)  /* Per-component a >= b. */
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_xor_si128(_mm_set1_epi32(-1), _mm_cmplt_epi32(a.m128, b.m128)));  /* NOT(a < b): XOR the less-than mask with all-ones. */
+#else
+	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);  /* NOTE: components are 0/1 here, 0/-1 masks in the SSE path. */
+#endif
+}
+
+ccl_device_inline int4 operator&(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+	return int4(_mm_and_si128(a.m128, b.m128));
+#else
+	return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+#endif
+}
+
 ccl_device_inline int4 min(int4 a, int4 b)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
@@ -116,4 +156,4 @@ ccl_device_inline int4 load_int4(const int *v)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_INT4_H__ */
+#endif  /* __UTIL_MATH_INT4_H__ */
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
index b5fbb24091f..190c2f5d6b0 100644
--- a/intern/cycles/util/util_math_intersect.h
+++ b/intern/cycles/util/util_math_intersect.h
@@ -219,4 +219,4 @@ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MATH_INTERSECT_H__ */
+#endif  /* __UTIL_MATH_INTERSECT_H__ */
diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h
index 9023ccee4c2..f8c0115d8ce 100644
--- a/intern/cycles/util/util_md5.h
+++ b/intern/cycles/util/util_md5.h
@@ -58,4 +58,4 @@ string util_md5_string(const string& str);
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_MD5_H__ */
+#endif  /* __UTIL_MD5_H__ */
diff --git a/intern/cycles/util/util_murmurhash.cpp b/intern/cycles/util/util_murmurhash.cpp
new file mode 100644
index 00000000000..68b2f2031be
--- /dev/null
+++ b/intern/cycles/util/util_murmurhash.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is taken from alShaders/Cryptomatte/MurmurHash3.h:
+ *
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "util/util_algorithm.h"
+#include "util/util_murmurhash.h"
+
+#if defined(_MSC_VER)
+#  define ROTL32(x,y) _rotl(x,y)
+#  define ROTL64(x,y) _rotl64(x,y)
+#  define BIG_CONSTANT(x) (x)
+#else
+ccl_device_inline uint32_t rotl32(uint32_t x, int8_t r)  /* Rotate x left by r bits (r taken modulo 32). */
+{
+	return (x << (r & 31)) | (x >> (-r & 31));  /* Masked counts: r == 0 no longer shifts right by 32, which is undefined. */
+}
+#  define ROTL32(x,y) rotl32(x,y)
+#  define BIG_CONSTANT(x) (x##LLU)
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Block read - if your platform needs to do endian-swapping or can only
+ * handle aligned reads, do the conversion here. */
+ccl_device_inline uint32_t mm_hash_getblock32(const uint32_t *p, int i)
+{
+	return p[i];
+}
+
+/* Finalization mix - force all bits of a hash block to avalanche */
+ccl_device_inline uint32_t mm_hash_fmix32 ( uint32_t h )
+{
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+uint32_t util_murmur_hash3(const void *key, int len, uint32_t seed)  /* 32-bit hash of the len bytes at key, starting from seed. */
+{
+	const uint8_t * data = (const uint8_t*)key;
+	const int nblocks = len / 4;  /* Number of whole 4-byte blocks. */
+
+	uint32_t h1 = seed;
+
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+
+	const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);  /* Points just past the last whole block; indexed with negative i below. */
+
+	for(int i = -nblocks; i; i++) {  /* Mix every whole block into h1, first to last. */
+		uint32_t k1 = mm_hash_getblock32(blocks,i);
+
+		k1 *= c1;
+		k1 = ROTL32(k1,15);
+		k1 *= c2;
+
+		h1 ^= k1;
+		h1 = ROTL32(h1,13);
+		h1 = h1 * 5 + 0xe6546b64;
+	}
+
+	const uint8_t *tail = (const uint8_t*)(data + nblocks*4);  /* The 0-3 bytes left after the whole blocks. */
+
+	uint32_t k1 = 0;
+
+	switch(len & 3) {  /* Gather the leftover bytes into k1; the cases fall through on purpose. */
+		case 3:
+			k1 ^= tail[2] << 16;
+			ATTR_FALLTHROUGH;
+		case 2:
+			k1 ^= tail[1] << 8;
+			ATTR_FALLTHROUGH;
+		case 1:
+			k1 ^= tail[0];
+			k1 *= c1;
+			k1 = ROTL32(k1,15);
+			k1 *= c2;
+			h1 ^= k1;
+	}
+
+	h1 ^= len;  /* Mix in the length, then run the finalization mix. */
+	h1 = mm_hash_fmix32(h1);
+	return h1;
+}
+
+/* This is taken from the cryptomatte specification 1.0 */
+float util_hash_to_float(uint32_t hash)  /* Reinterpret hash as a float after clamping its exponent bits. */
+{
+	uint32_t mantissa = hash & (( 1 << 23) - 1);  /* Low 23 bits. */
+	uint32_t exponent = (hash >> 23) & ((1 << 8) - 1);  /* Next 8 bits (23..30). */
+	exponent = max(exponent, (uint32_t) 1);  /* Exclude exponent 0 (IEEE 754 zero/denormal encodings). */
+	exponent = min(exponent, (uint32_t) 254);  /* Exclude exponent 255 (IEEE 754 inf/NaN encodings). */
+	exponent = exponent << 23;
+	uint32_t sign = (hash >> 31);  /* Top bit, moved back to bit 31 on the next line. */
+	sign = sign << 31;
+	uint32_t float_bits = sign | exponent | mantissa;
+	float f;
+	memcpy(&f, &float_bits, sizeof(uint32_t));  /* Copy the bit pattern into the float unchanged. */
+	return f;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_murmurhash.h b/intern/cycles/util/util_murmurhash.h
new file mode 100644
index 00000000000..3e7897d3ae6
--- /dev/null
+++ b/intern/cycles/util/util_murmurhash.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef __UTIL_MURMURHASH_H__
+#define __UTIL_MURMURHASH_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+uint32_t util_murmur_hash3(const void *key, int len, uint32_t seed);
+float util_hash_to_float(uint32_t hash);
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_MURMURHASH_H__ */
diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h
index 0b5462e0a09..2ca7b7e4c87 100644
--- a/intern/cycles/util/util_opengl.h
+++ b/intern/cycles/util/util_opengl.h
@@ -28,4 +28,4 @@
 #  define mxMakeCurrentContext(x) (x)
 #endif
 
-#endif /* __UTIL_OPENGL_H__ */
+#endif  /* __UTIL_OPENGL_H__ */
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 3b3627242d5..5267bd9a97a 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -68,4 +68,4 @@
 
 #endif
 
-#endif /* __UTIL_OPTIMIZATION_H__ */
+#endif  /* __UTIL_OPTIMIZATION_H__ */
diff --git a/intern/cycles/util/util_param.h b/intern/cycles/util/util_param.h
index 246b5cb7d63..4453c66aae2 100644
--- a/intern/cycles/util/util_param.h
+++ b/intern/cycles/util/util_param.h
@@ -30,4 +30,4 @@ OIIO_NAMESPACE_USING
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_PARAM_H__ */
+#endif  /* __UTIL_PARAM_H__ */
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 51b7944705e..93080a6c80c 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -614,7 +614,7 @@ bool path_exists(const string& path)
 		return 0;
 	}
 	return st.st_mode != 0;
-#endif /* _WIN32 */
+#endif  /* _WIN32 */
 }
 
 bool path_is_directory(const string& path)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 3ef15c5c09a..4ed9ebd60ff 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -114,7 +114,7 @@ public:
 		return cancel_message;
 	}
 
-	void set_cancel_callback(function<void(void)> function)
+	void set_cancel_callback(function<void()> function)
 	{
 		cancel_cb = function;
 	}
@@ -323,7 +323,7 @@ public:
 		}
 	}
 
-	void set_update_callback(function<void(void)> function)
+	void set_update_callback(function<void()> function)
 	{
 		update_cb = function;
 	}
@@ -331,8 +331,8 @@ public:
 protected:
 	thread_mutex progress_mutex;
 	thread_mutex update_mutex;
-	function<void(void)> update_cb;
-	function<void(void)> cancel_cb;
+	function<void()> update_cb;
+	function<void()> cancel_cb;
 
 	/* pixel_samples counts how many samples have been rendered over all pixel, not just per pixel.
 	 * This makes the progress estimate more accurate when tiles with different sizes are used.
@@ -365,4 +365,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_PROGRESS_H__ */
+#endif  /* __UTIL_PROGRESS_H__ */
diff --git a/intern/cycles/util/util_projection.h b/intern/cycles/util/util_projection.h
index 26b4843928c..d1af013ae3a 100644
--- a/intern/cycles/util/util_projection.h
+++ b/intern/cycles/util/util_projection.h
@@ -169,8 +169,8 @@ ccl_device_inline ProjectionTransform projection_orthographic(float znear, float
 	return ProjectionTransform(t);
 }
 
-#endif /* __KERNEL_GPU__ */
+#endif  /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_PROJECTION_H__ */
+#endif  /* __UTIL_PROJECTION_H__ */
diff --git a/intern/cycles/util/util_queue.h b/intern/cycles/util/util_queue.h
index 622f4fe3e47..0a2b7718f57 100644
--- a/intern/cycles/util/util_queue.h
+++ b/intern/cycles/util/util_queue.h
@@ -25,4 +25,4 @@ using std::queue;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_LIST_H__ */
+#endif  /* __UTIL_QUEUE_H__ */
diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h
index d13baefe85e..389669acf2e 100644
--- a/intern/cycles/util/util_rect.h
+++ b/intern/cycles/util/util_rect.h
@@ -69,4 +69,4 @@ ccl_device_inline int rect_size(int4 rect)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_RECT_H__ */
+#endif  /* __UTIL_RECT_H__ */
diff --git a/intern/cycles/util/util_set.h b/intern/cycles/util/util_set.h
index 298e1f7729a..a9c56bb4919 100644
--- a/intern/cycles/util/util_set.h
+++ b/intern/cycles/util/util_set.h
@@ -31,4 +31,4 @@ using std::unordered_set;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_SET_H__ */
+#endif  /* __UTIL_SET_H__ */
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index cc7f436c8fe..565ea768089 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -219,9 +219,9 @@ __forceinline size_t __bscf(size_t& v)
   return i;
 }
 
-#endif /* __KERNEL_64_BIT__ */
+#endif  /* __KERNEL_64_BIT__ */
 
-#else /* _WIN32 */
+#else  /* _WIN32 */
 
 __forceinline unsigned int __popcnt(unsigned int in) {
   int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
@@ -344,7 +344,7 @@ __forceinline size_t __bscf(size_t& v)
 }
 #endif
 
-#endif /* _WIN32 */
+#endif  /* _WIN32 */
 
 /* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
  * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
@@ -442,7 +442,7 @@ __forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
   return value;
 }
 
-#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+#endif  /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #else  /* __KERNEL_SSE2__ */
 
@@ -470,7 +470,7 @@ ccl_device_inline int __bsr(int value)
 	return bit;
 }
 
-#endif /* __KERNEL_SSE2__ */
+#endif  /* __KERNEL_SSE2__ */
 
 /* quiet unused define warnings */
 #if defined(__KERNEL_SSE2__)  || \
@@ -484,6 +484,6 @@ ccl_device_inline int __bsr(int value)
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_GPU__ */
+#endif  /* __KERNEL_GPU__ */
 
-#endif /* __UTIL_SIMD_TYPES_H__ */
+#endif  /* __UTIL_SIMD_TYPES_H__ */
diff --git a/intern/cycles/util/util_sky_model.h b/intern/cycles/util/util_sky_model.h
index 237e4e61bf5..2e593f58c39 100644
--- a/intern/cycles/util/util_sky_model.h
+++ b/intern/cycles/util/util_sky_model.h
@@ -437,6 +437,6 @@ double arhosekskymodel_solar_radiance(ArHosekSkyModelState* state,
                                       double wavelength);
 
 
-#endif // _SKY_MODEL_H_
+#endif  // _SKY_MODEL_H_
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 115b133c662..f6810505126 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -177,7 +177,7 @@ __forceinline size_t movemask( const sseb& a ) { return _mm_movemask_ps(a); }
 
 ccl_device_inline void print_sseb(const char *label, const sseb &a)
 {
-	printf("%s: %df %df %df %d\n",
+	printf("%s: %d %d %d %d\n",
 	       label, a[0], a[1], a[2], a[3]);
 }
 
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index b5623860e33..66670c9a779 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -44,8 +44,8 @@ struct ssef
 	__forceinline ssef& operator=(const ssef& other) { m128 = other.m128; return *this; }
 
 	__forceinline ssef(const __m128 a) : m128(a) {}
-	__forceinline operator const __m128&(void) const { return m128; }
-	__forceinline operator       __m128&(void)       { return m128; }
+	__forceinline operator const __m128&() const { return m128; }
+	__forceinline operator       __m128&()       { return m128; }
 
 	__forceinline ssef          (float a) : m128(_mm_set1_ps(a)) {}
 	__forceinline ssef          (float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d)) {}
@@ -517,12 +517,12 @@ ccl_device_inline float len3(const ssef& a)
 /* faster version for SSSE3 */
 typedef ssei shuffle_swap_t;
 
-ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity()
 {
 	return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 }
 
-ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap()
 {
 	return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
 }
@@ -537,12 +537,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s
 /* somewhat slower version for SSE2 */
 typedef int shuffle_swap_t;
 
-ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity()
 {
 	return 0;
 }
 
-ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap()
 {
 	return 1;
 }
diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h
index 79a535bd170..4e978e18bee 100644
--- a/intern/cycles/util/util_stack_allocator.h
+++ b/intern/cycles/util/util_stack_allocator.h
@@ -53,7 +53,7 @@ public:
 
 	T *allocate(size_t n, const void *hint = 0)
 	{
-		(void)hint;
+		(void) hint;
 		if(n == 0) {
 			return NULL;
 		}
@@ -164,4 +164,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif  /* __UTIL_GUARDED_ALLOCATOR_H__ */
+#endif  /* __UTIL_STACK_ALLOCATOR_H__ */
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index dc3cb3f6ecc..b1c6c374693 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -47,4 +47,4 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_STATIC_ASSERT_H__ */
+#endif  /* __UTIL_STATIC_ASSERT_H__ */
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index 7667f58eb7d..0ba58422a67 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -44,4 +44,4 @@ public:
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_STATS_H__ */
+#endif  /* __UTIL_STATS_H__ */
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index 3a4f4398158..f17112c30d2 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -74,4 +74,4 @@ string string_human_readable_number(size_t num);
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_STRING_H__ */
+#endif  /* __UTIL_STRING_H__ */
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 1b039888452..34f428f111c 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -52,14 +52,14 @@ int system_cpu_group_thread_count(int group)
 	util_windows_init_numa_groups();
 	return GetActiveProcessorCount(group);
 #elif defined(__APPLE__)
-	(void)group;
+	(void) group;
 	int count;
 	size_t len = sizeof(count);
 	int mib[2] = { CTL_HW, HW_NCPU };
 	sysctl(mib, 2, &count, &len, NULL, 0);
 	return count;
 #else
-	(void)group;
+	(void) group;
 	return sysconf(_SC_NPROCESSORS_ONLN);
 #endif
 }
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 42ad72356b9..241ac897157 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -46,4 +46,4 @@ size_t system_physical_ram();
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_SYSTEM_H__ */
+#endif  /* __UTIL_SYSTEM_H__ */
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index f752e81128d..233cfe33305 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -89,4 +89,4 @@ typedef struct TextureInfo {
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_TEXTURE_H__ */
+#endif  /* __UTIL_TEXTURE_H__ */
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 16a8591a8a9..37d8bdbd4b0 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -21,7 +21,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-thread::thread(function<void(void)> run_cb, int group)
+thread::thread(function<void()> run_cb, int group)
   : run_cb_(run_cb),
     joined_(false),
 	group_(group)
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index f39fcfb4279..6250bb95dcf 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -46,14 +46,14 @@ typedef std::condition_variable thread_condition_variable;
 
 class thread {
 public:
-	thread(function<void(void)> run_cb, int group = -1);
+	thread(function<void()> run_cb, int group = -1);
 	~thread();
 
 	static void *run(void *arg);
 	bool join();
 
 protected:
-	function<void(void)> run_cb_;
+	function<void()> run_cb_;
 	std::thread thread_;
 	bool joined_;
 	int group_;
@@ -138,4 +138,4 @@ protected:
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_THREAD_H__ */
+#endif  /* __UTIL_THREAD_H__ */
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index e781f85dded..e4cadd3e81a 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -424,6 +424,31 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
 
 #ifndef __KERNEL_GPU__
 
+#ifdef WITH_EMBREE
+ccl_device void transform_motion_array_interpolate_straight(Transform *tfm,
+                                                            const ccl_global DecomposedTransform *motion,
+                                                            uint numsteps,
+                                                            float time)
+{
+	/* Pick the adjacent step pair to blend; NOTE(review): step becomes negative when numsteps < 2 - confirm callers never pass that. */
+	int maxstep = numsteps - 1;
+	int step = min((int)(time*maxstep), maxstep - 1);
+	float t = time * maxstep - step;
+
+	const ccl_global DecomposedTransform *a = motion + step;
+	const ccl_global DecomposedTransform *b = motion + step + 1;
+	Transform step1, step2;
+
+	transform_compose(&step1, a);
+	transform_compose(&step2, b);
+
+	/* Blend the x/y/z members of the two composed transforms linearly by t. */
+	tfm->x = (1.0f - t) * step1.x + t * step2.x;
+	tfm->y = (1.0f - t) * step1.y + t * step2.y;
+	tfm->z = (1.0f - t) * step1.z + t * step2.z;
+}
+#endif
+
 class BoundBox2D;
 
 ccl_device_inline bool operator==(const DecomposedTransform& A, const DecomposedTransform& B)
@@ -470,4 +495,4 @@ OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_TRANSFORM_H__ */
+#endif  /* __UTIL_TRANSFORM_H__ */
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 96c549b9be5..535048d8f8c 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -156,4 +156,4 @@ CCL_NAMESPACE_END
 #endif
 #endif
 
-#endif /* __UTIL_TYPES_H__ */
+#endif  /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h
index 28146ad04f7..ed2300e7996 100644
--- a/intern/cycles/util/util_types_float3.h
+++ b/intern/cycles/util/util_types_float3.h
@@ -35,8 +35,8 @@ struct ccl_try_align(16) float3 {
 	__forceinline float3(const float3& a);
 	__forceinline explicit float3(const __m128& a);
 
-	__forceinline operator const __m128&(void) const;
-	__forceinline operator __m128&(void);
+	__forceinline operator const __m128&() const;
+	__forceinline operator __m128&();
 
 	__forceinline float3& operator =(const float3& a);
 #else  /* __KERNEL_SSE__ */
diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h
index 45f61767d3f..2e840a5c399 100644
--- a/intern/cycles/util/util_types_float3_impl.h
+++ b/intern/cycles/util/util_types_float3_impl.h
@@ -43,12 +43,12 @@ __forceinline float3::float3(const __m128& a)
 {
 }
 
-__forceinline float3::operator const __m128&(void) const
+__forceinline float3::operator const __m128&() const
 {
 	return m128;
 }
 
-__forceinline float3::operator __m128&(void)
+__forceinline float3::operator __m128&()
 {
 	return m128;
 }
diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h
index 154391f6881..5c10d483c2e 100644
--- a/intern/cycles/util/util_types_float4.h
+++ b/intern/cycles/util/util_types_float4.h
@@ -36,8 +36,8 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4();
 	__forceinline explicit float4(const __m128& a);
 
-	__forceinline operator const __m128&(void) const;
-	__forceinline operator __m128&(void);
+	__forceinline operator const __m128&() const;
+	__forceinline operator __m128&();
 
 	__forceinline float4& operator =(const float4& a);
 
diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h
index a49fac65b10..a83148031f1 100644
--- a/intern/cycles/util/util_types_float4_impl.h
+++ b/intern/cycles/util/util_types_float4_impl.h
@@ -38,12 +38,12 @@ __forceinline float4::float4(const __m128& a)
 {
 }
 
-__forceinline float4::operator const __m128&(void) const
+__forceinline float4::operator const __m128&() const
 {
 	return m128;
 }
 
-__forceinline float4::operator __m128&(void)
+__forceinline float4::operator __m128&()
 {
 	return m128;
 }
diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h
index 64ec5a71355..08720b8ff48 100644
--- a/intern/cycles/util/util_types_float8.h
+++ b/intern/cycles/util/util_types_float8.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_GPU__
 
-struct ccl_try_align(16) float8 {
+struct ccl_try_align(32) float8 {
 #ifdef __KERNEL_AVX2__
 	union {
 		__m256 m256;
@@ -48,8 +48,8 @@ struct ccl_try_align(16) float8 {
 	__forceinline float8(const float8& a);
 	__forceinline explicit float8(const __m256& a);
 
-	__forceinline operator const __m256&(void) const;
-	__forceinline operator __m256&(void);
+	__forceinline operator const __m256&() const;
+	__forceinline operator __m256&();
 
 	__forceinline float8& operator =(const float8& a);
 
diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h
index ebf8260bc7c..84fe233c334 100644
--- a/intern/cycles/util/util_types_float8_impl.h
+++ b/intern/cycles/util/util_types_float8_impl.h
@@ -55,12 +55,12 @@ __forceinline float8::float8(const __m256& f)
 {
 }
 
-__forceinline float8::operator const __m256&(void) const
+__forceinline float8::operator const __m256&() const
 {
 	return m256;
 }
 
-__forceinline float8::operator __m256&(void)
+__forceinline float8::operator __m256&()
 {
 	return m256;
 }
diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h
index 9d43b201c02..f68074b982b 100644
--- a/intern/cycles/util/util_types_int3.h
+++ b/intern/cycles/util/util_types_int3.h
@@ -35,8 +35,8 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const int3& a);
 	__forceinline explicit int3(const __m128i& a);
 
-	__forceinline operator const __m128i&(void) const;
-	__forceinline operator __m128i&(void);
+	__forceinline operator const __m128i&() const;
+	__forceinline operator __m128i&();
 
 	__forceinline int3& operator =(const int3& a);
 #else  /* __KERNEL_SSE__ */
diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h
index ada50c4812c..1b195ca753f 100644
--- a/intern/cycles/util/util_types_int3_impl.h
+++ b/intern/cycles/util/util_types_int3_impl.h
@@ -43,12 +43,12 @@ __forceinline int3::int3(const int3& a)
 {
 }
 
-__forceinline int3::operator const __m128i&(void) const
+__forceinline int3::operator const __m128i&() const
 {
 	return m128;
 }
 
-__forceinline int3::operator __m128i&(void)
+__forceinline int3::operator __m128i&()
 {
 	return m128;
 }
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
index cdd0ecbdae5..52e6fed8c14 100644
--- a/intern/cycles/util/util_types_int4.h
+++ b/intern/cycles/util/util_types_int4.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 #ifndef __KERNEL_GPU__
 
 struct float3;
+struct float4;
 
 struct ccl_try_align(16) int4 {
 #ifdef __KERNEL_SSE__
@@ -38,8 +39,8 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const int4& a);
 	__forceinline explicit int4(const __m128i& a);
 
-	__forceinline operator const __m128i&(void) const;
-	__forceinline operator __m128i&(void);
+	__forceinline operator const __m128i&() const;
+	__forceinline operator __m128i&();
 
 	__forceinline int4& operator=(const int4& a);
 #else  /* __KERNEL_SSE__ */
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
 ccl_device_inline int4 make_int4(int i);
 ccl_device_inline int4 make_int4(int x, int y, int z, int w);
 ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline int4 make_int4(const float4& f);
 ccl_device_inline void print_int4(const char *label, const int4& a);
 #endif  /* __KERNEL_GPU__ */
 
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
index 07cdc88f2dc..c058f86c400 100644
--- a/intern/cycles/util/util_types_int4_impl.h
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -43,12 +43,12 @@ __forceinline int4::int4(const __m128i& a)
 {
 }
 
-__forceinline int4::operator const __m128i&(void) const
+__forceinline int4::operator const __m128i&() const
 {
 	return m128;
 }
 
-__forceinline int4::operator __m128i&(void)
+__forceinline int4::operator __m128i&()
 {
 	return m128;
 }
@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
 	return a;
 }
 
+ccl_device_inline int4 make_int4(const float4& f)
+{
+#ifdef __KERNEL_SSE__
+	int4 a(_mm_cvtps_epi32(f.m128));  /* NOTE(review): rounds per MXCSR mode; the scalar branch truncates - confirm intended. */
+#else
+	int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+	return a;
+}
+
 ccl_device_inline void print_int4(const char *label, const int4& a)
 {
 	printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 0b33221ad4d..18fa231d6e7 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -17,8 +17,6 @@
 #ifndef __UTIL_VECTOR_H__
 #define __UTIL_VECTOR_H__
 
-/* Vector */
-
 #include <cassert>
 #include <cstring>
 #include <vector>
@@ -29,12 +27,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Vector
- *
- * Own subclass-ed vestion of std::vector. Subclass is needed because:
+/* Own subclass-ed version of std::vector. Subclass is needed because:
  *
  * - Use own allocator which keeps track of used/peak memory.
- *
  * - Have method to ensure capacity is re-set to 0.
  */
 template<typename value_type,
@@ -42,30 +37,16 @@ template<typename value_type,
 class vector : public std::vector<value_type, allocator_type>
 {
 public:
-	/* Default constructor. */
-	explicit vector() : std::vector<value_type, allocator_type>() {  }
-
-	/* Fill constructor. */
-	explicit vector(size_t n, const value_type& val = value_type())
-		: std::vector<value_type, allocator_type>(n, val) {  }
-
-	/* Range constructor. */
-	template <class InputIterator>
-	vector(InputIterator first, InputIterator last)
-		: std::vector<value_type, allocator_type>(first, last) {  }
-
-	/* Copy constructor. */
-	vector(const vector &x) : std::vector<value_type, allocator_type>(x) {  }
+	typedef std::vector<value_type, allocator_type> BaseClass;
 
-	void shrink_to_fit(void)
-	{
-		std::vector<value_type, allocator_type>::shrink_to_fit();
-	}
+	/* Inherit all constructors from base class. */
+	using BaseClass::vector;
 
-	void free_memory(void)
+	/* Try as hard as possible to use zero memory. */
+	void free_memory()
 	{
-		std::vector<value_type, allocator_type>::resize(0);
-		shrink_to_fit();
+		BaseClass::resize(0);
+		BaseClass::shrink_to_fit();
 	}
 
 	/* Some external API might demand working with std::vector. */
@@ -75,265 +56,6 @@ public:
 	}
 };
 
-/* Array
- *
- * Simplified version of vector, serving multiple purposes:
- * - somewhat faster in that it does not clear memory on resize/alloc,
- *   this was actually showing up in profiles quite significantly. it
- *   also does not run any constructors/destructors
- * - if this is used, we are not tempted to use inefficient operations
- * - aligned allocation for CPU native data types */
-
-template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES>
-class array
-{
-public:
-	array()
-	: data_(NULL),
-	  datasize_(0),
-	  capacity_(0)
-	{}
-
-	explicit array(size_t newsize)
-	{
-		if(newsize == 0) {
-			data_ = NULL;
-			datasize_ = 0;
-			capacity_ = 0;
-		}
-		else {
-			data_ = mem_allocate(newsize);
-			datasize_ = newsize;
-			capacity_ = datasize_;
-		}
-	}
-
-	array(const array& from)
-	{
-		if(from.datasize_ == 0) {
-			data_ = NULL;
-			datasize_ = 0;
-			capacity_ = 0;
-		}
-		else {
-			data_ = mem_allocate(from.datasize_);
-			memcpy(data_, from.data_, from.datasize_*sizeof(T));
-			datasize_ = from.datasize_;
-			capacity_ = datasize_;
-		}
-	}
-
-	array& operator=(const array& from)
-	{
-		if(this != &from) {
-			resize(from.size());
-			memcpy((void*)data_, from.data_, datasize_*sizeof(T));
-		}
-
-		return *this;
-	}
-
-	array& operator=(const vector<T>& from)
-	{
-		resize(from.size());
-
-		if(from.size() > 0) {
-			memcpy(data_, &from[0], datasize_*sizeof(T));
-		}
-
-		return *this;
-	}
-
-	~array()
-	{
-		mem_free(data_, capacity_);
-	}
-
-	bool operator==(const array<T>& other) const
-	{
-		if(datasize_ != other.datasize_) {
-			return false;
-		}
-
-		return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0;
-	}
-
-	bool operator!=(const array<T>& other) const
-	{
-		return !(*this == other);
-	}
-
-	void steal_data(array& from)
-	{
-		if(this != &from) {
-			clear();
-
-			data_ = from.data_;
-			datasize_ = from.datasize_;
-			capacity_ = from.capacity_;
-
-			from.data_ = NULL;
-			from.datasize_ = 0;
-			from.capacity_ = 0;
-		}
-	}
-
-	T *steal_pointer()
-	{
-		T *ptr = data_;
-		data_ = NULL;
-		clear();
-		return ptr;
-	}
-
-	T* resize(size_t newsize)
-	{
-		if(newsize == 0) {
-			clear();
-		}
-		else if(newsize != datasize_) {
-			if(newsize > capacity_) {
-				T *newdata = mem_allocate(newsize);
-				if(newdata == NULL) {
-					/* Allocation failed, likely out of memory. */
-					clear();
-					return NULL;
-				}
-				else if(data_ != NULL) {
-					memcpy((void *)newdata,
-					       data_,
-					       ((datasize_ < newsize)? datasize_: newsize)*sizeof(T));
-					mem_free(data_, capacity_);
-				}
-				data_ = newdata;
-				capacity_ = newsize;
-			}
-			datasize_ = newsize;
-		}
-		return data_;
-	}
-
-	T* resize(size_t newsize, const T& value)
-	{
-		size_t oldsize = size();
-		resize(newsize);
-
-		for(size_t i = oldsize; i < size(); i++) {
-			data_[i] = value;
-		}
-
-		return data_;
-	}
-
-	void clear()
-	{
-		if(data_ != NULL) {
-			mem_free(data_, capacity_);
-			data_ = NULL;
-		}
-		datasize_ = 0;
-		capacity_ = 0;
-	}
-
-	size_t empty() const
-	{
-		return datasize_ == 0;
-	}
-
-	size_t size() const
-	{
-		return datasize_;
-	}
-
-	T* data()
-	{
-		return data_;
-	}
-
-	const T* data() const
-	{
-		return data_;
-	}
-
-	T& operator[](size_t i) const
-	{
-		assert(i < datasize_);
-		return data_[i];
-	}
-
-	void reserve(size_t newcapacity)
-	{
-		if(newcapacity > capacity_) {
-			T *newdata = mem_allocate(newcapacity);
-			if(data_ != NULL) {
-				memcpy(newdata, data_, ((datasize_ < newcapacity)? datasize_: newcapacity)*sizeof(T));
-				mem_free(data_, capacity_);
-			}
-			data_ = newdata;
-			capacity_ = newcapacity;
-		}
-	}
-
-	size_t capacity() const
-	{
-		return capacity_;
-	}
-
-	// do not use this method unless you are sure the code is not performance critical
-	void push_back_slow(const T& t)
-	{
-		if(capacity_ == datasize_)
-		{
-			reserve(datasize_ == 0 ? 1 : (size_t)((datasize_ + 1) * 1.2));
-		}
-
-		data_[datasize_++] = t;
-	}
-
-	void push_back_reserved(const T& t)
-	{
-		assert(datasize_ < capacity_);
-		push_back_slow(t);
-	}
-
-	void append(const array<T>& from)
-	{
-		if(from.size()) {
-			size_t old_size = size();
-			resize(old_size + from.size());
-			memcpy(data_ + old_size, from.data(), sizeof(T) * from.size());
-		}
-	}
-
-protected:
-	inline T* mem_allocate(size_t N)
-	{
-		if(N == 0) {
-			return NULL;
-		}
-		T *mem = (T*)util_aligned_malloc(sizeof(T)*N, alignment);
-		if(mem != NULL) {
-			util_guarded_mem_alloc(sizeof(T)*N);
-		}
-		else {
-			throw std::bad_alloc();
-		}
-		return mem;
-	}
-
-	inline void mem_free(T *mem, size_t N)
-	{
-		if(mem != NULL) {
-			util_guarded_mem_free(sizeof(T)*N);
-			util_aligned_free(mem);
-		}
-	}
-
-	T *data_;
-	size_t datasize_;
-	size_t capacity_;
-};
-
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_VECTOR_H__ */
+#endif  /* __UTIL_VECTOR_H__ */
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index 112255f447b..980c5a269e6 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -34,4 +34,4 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_VERSION_H__ */
+#endif  /* __UTIL_VERSION_H__ */
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 3836cc86ee0..9dffd7a80bd 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -215,7 +215,7 @@ static void view_motion(int x, int y)
 	V.mouseY = y;
 }
 
-static void view_idle(void)
+static void view_idle()
 {
 	if(V.redraw) {
 		V.redraw = false;
diff --git a/intern/cycles/util/util_view.h b/intern/cycles/util/util_view.h
index e603e605776..ae50b098b39 100644
--- a/intern/cycles/util/util_view.h
+++ b/intern/cycles/util/util_view.h
@@ -22,10 +22,10 @@
 
 CCL_NAMESPACE_BEGIN
 
-typedef void (*ViewInitFunc)(void);
-typedef void (*ViewExitFunc)(void);
+typedef void (*ViewInitFunc)();
+typedef void (*ViewExitFunc)();
 typedef void (*ViewResizeFunc)(int width, int height);
-typedef void (*ViewDisplayFunc)(void);
+typedef void (*ViewDisplayFunc)();
 typedef void (*ViewKeyboardFunc)(unsigned char key);
 typedef void (*ViewMotionFunc)(int x, int y, int button);
 
@@ -40,4 +40,4 @@ void view_redraw();
 
 CCL_NAMESPACE_END
 
-#endif /*__UTIL_VIEW_H__*/
+#endif  /*__UTIL_VIEW_H__*/
diff --git a/intern/cycles/util/util_windows.h b/intern/cycles/util/util_windows.h
index 3b23ce8a3cb..9b9268fed7a 100644
--- a/intern/cycles/util/util_windows.h
+++ b/intern/cycles/util/util_windows.h
@@ -56,4 +56,4 @@ CCL_NAMESPACE_END
 
 #endif  /* WIN32 */
 
-#endif /* __UTIL_WINDOWS_H__ */
+#endif  /* __UTIL_WINDOWS_H__ */
diff --git a/intern/cycles/util/util_xml.h b/intern/cycles/util/util_xml.h
index 6f06f17937b..c8a3a495f30 100644
--- a/intern/cycles/util/util_xml.h
+++ b/intern/cycles/util/util_xml.h
@@ -38,4 +38,4 @@ using PUGIXML_NAMESPACE::xml_parse_result;
 
 CCL_NAMESPACE_END
 
-#endif /* __UTIL_XML_H__ */
+#endif  /* __UTIL_XML_H__ */
diff --git a/intern/elbeem/extern/LBM_fluidsim.h b/intern/elbeem/extern/LBM_fluidsim.h
index 374e11d1c06..05ac143c7a1 100644
--- a/intern/elbeem/extern/LBM_fluidsim.h
+++ b/intern/elbeem/extern/LBM_fluidsim.h
@@ -1,7 +1,4 @@
 /*
- * BKE_fluidsim.h 
- *	
- *
  * ***** BEGIN GPL LICENSE BLOCK *****
  *
  * This program is free software; you can redistribute it and/or
@@ -44,5 +41,3 @@ int performElbeemSimulation(char *cfgfilename);
 
 
 #endif
-
-
diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h
index 86e64b7eeb0..bc5c67f4bed 100644
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@@ -908,6 +908,11 @@ extern int GHOST_SupportsNativeDialogs(void);
 extern int GHOST_UseNativePixels(void);
 
 /**
+ * Focus windows after opening, or put them in the background.
+ */
+extern void GHOST_UseWindowFocus(int use_focus);
+
+/**
  * If window was opened using native pixel size, it returns scaling factor.
  */
 extern float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle);
diff --git a/intern/ghost/GHOST_ISystem.h b/intern/ghost/GHOST_ISystem.h
index be2a94bd508..9b017c2da38 100644
--- a/intern/ghost/GHOST_ISystem.h
+++ b/intern/ghost/GHOST_ISystem.h
@@ -306,6 +306,11 @@ public:
 	 */
 	virtual bool useNativePixel(void) = 0;
 
+	/**
+	 * Focus windows after opening, or put them in the background.
+	 */
+	virtual void useWindowFocus(const bool use_focus) = 0;
+
 	/***************************************************************************************
 	 * Event management functionality
 	 ***************************************************************************************/
diff --git a/intern/ghost/intern/GHOST_C-api.cpp b/intern/ghost/intern/GHOST_C-api.cpp
index 797fb16dd6f..e60a8a9d3f3 100644
--- a/intern/ghost/intern/GHOST_C-api.cpp
+++ b/intern/ghost/intern/GHOST_C-api.cpp
@@ -911,6 +911,12 @@ int GHOST_UseNativePixels(void)
 	return system->useNativePixel();
 }
 
+void GHOST_UseWindowFocus(int use_focus)
+{
+	GHOST_ISystem *system = GHOST_ISystem::getSystem();
+	system->useWindowFocus(use_focus != 0);  /* C API passes an int flag; convert explicitly to the bool the C++ interface takes. */
+}
+
 float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle)
 {
 	GHOST_IWindow *window = (GHOST_IWindow *) windowhandle;
diff --git a/intern/ghost/intern/GHOST_DropTargetWin32.h b/intern/ghost/intern/GHOST_DropTargetWin32.h
index 3d7be45799f..5dcefcaf25e 100644
--- a/intern/ghost/intern/GHOST_DropTargetWin32.h
+++ b/intern/ghost/intern/GHOST_DropTargetWin32.h
@@ -60,7 +60,7 @@ public:
 	 *  - Determining the effect of the drop on the target application.
 	 *  - Incorporating any valid dropped data when the drop occurs.
 	 *  - Communicating target feedback to the source so the source application
-	 *	  can provide appropriate visual feedback such as setting the cursor.
+	 *    can provide appropriate visual feedback such as setting the cursor.
 	 *  - Implementing drag scrolling.
 	 *  - Registering and revoking its application windows as drop targets.
 	 *
diff --git a/intern/ghost/intern/GHOST_EventTrackpad.h b/intern/ghost/intern/GHOST_EventTrackpad.h
index faf0f1697d0..25988090c01 100644
--- a/intern/ghost/intern/GHOST_EventTrackpad.h
+++ b/intern/ghost/intern/GHOST_EventTrackpad.h
@@ -20,8 +20,8 @@
  *
  * The Original Code is: all of this file.
  *
- * Contributor(s):  James Deery		11/2009
- *					Damien Plisson	12/2009
+ * Contributor(s): James Deery     11/2009
+ *                 Damien Plisson  12/2009
  *
  * ***** END GPL LICENSE BLOCK *****
  */
diff --git a/intern/ghost/intern/GHOST_System.cpp b/intern/ghost/intern/GHOST_System.cpp
index 0629eacc3ff..39f915bb0c2 100644
--- a/intern/ghost/intern/GHOST_System.cpp
+++ b/intern/ghost/intern/GHOST_System.cpp
@@ -48,6 +48,7 @@
 
 GHOST_System::GHOST_System()
     : m_nativePixel(false),
+      m_windowFocus(true),
       m_displayManager(NULL),
       m_timerManager(NULL),
       m_windowManager(NULL),
@@ -390,3 +391,8 @@ bool GHOST_System::useNativePixel(void)
 	m_nativePixel = true;
 	return 1;
 }
+
+void GHOST_System::useWindowFocus(const bool use_focus)
+{
+	m_windowFocus = use_focus;  /* Only stores the flag; read later, e.g. by the Cocoa app delegate on launch. */
+}
diff --git a/intern/ghost/intern/GHOST_System.h b/intern/ghost/intern/GHOST_System.h
index 464d9269f28..ee3c30c35b4 100644
--- a/intern/ghost/intern/GHOST_System.h
+++ b/intern/ghost/intern/GHOST_System.h
@@ -177,6 +177,12 @@ public:
 	bool useNativePixel(void);
 	bool m_nativePixel;
 
+	/**
+	 * Focus windows after opening, or put them in the background.
+	 */
+	void useWindowFocus(const bool use_focus);
+	bool m_windowFocus;
+
 	/***************************************************************************************
 	 * Event management functionality
 	 ***************************************************************************************/
diff --git a/intern/ghost/intern/GHOST_SystemCocoa.h b/intern/ghost/intern/GHOST_SystemCocoa.h
index d31df79bc2b..146fa841555 100644
--- a/intern/ghost/intern/GHOST_SystemCocoa.h
+++ b/intern/ghost/intern/GHOST_SystemCocoa.h
@@ -20,8 +20,8 @@
  *
  * The Original Code is: all of this file.
  *
- * Contributor(s):	Maarten Gribnau 05/2001
- *					Damien Plisson 09/2009
+ * Contributor(s): Maarten Gribnau 05/2001
+ *                 Damien Plisson 09/2009
  *
  * ***** END GPL LICENSE BLOCK *****
  */
diff --git a/intern/ghost/intern/GHOST_SystemCocoa.mm b/intern/ghost/intern/GHOST_SystemCocoa.mm
index 916da546669..697c0fae809 100644
--- a/intern/ghost/intern/GHOST_SystemCocoa.mm
+++ b/intern/ghost/intern/GHOST_SystemCocoa.mm
@@ -304,11 +304,13 @@ extern "C" int GHOST_HACK_getFirstFile(char buf[FIRSTFILEBUFLG])
 
 - (void)applicationDidFinishLaunching:(NSNotification *)aNotification
 {
-	// raise application to front, convenient when starting from the terminal
-	// and important for launching the animation player. we call this after the
-	// application finishes launching, as doing it earlier can make us end up
-	// with a frontmost window but an inactive application
-	[NSApp activateIgnoringOtherApps:YES];
+	if (systemCocoa->m_windowFocus) {
+		// Raise application to front, convenient when starting from the terminal
+		// and important for launching the animation player. we call this after the
+		// application finishes launching, as doing it earlier can make us end up
+		// with a frontmost window but an inactive application.
+		[NSApp activateIgnoringOtherApps:YES];
+	}
 }
 
 - (BOOL)application:(NSApplication *)theApplication openFile:(NSString *)filename
diff --git a/intern/ghost/intern/GHOST_SystemX11.h b/intern/ghost/intern/GHOST_SystemX11.h
index 3a65ff2a843..e9312ceb683 100644
--- a/intern/ghost/intern/GHOST_SystemX11.h
+++ b/intern/ghost/intern/GHOST_SystemX11.h
@@ -149,16 +149,16 @@ public:
 	 * Create a new window.
 	 * The new window is added to the list of windows managed.
 	 * Never explicitly delete the window, use disposeWindow() instead.
-	 * \param	title	The name of the window (displayed in the title bar of the window if the OS supports it).
-	 * \param	left		The coordinate of the left edge of the window.
-	 * \param	top		The coordinate of the top edge of the window.
-	 * \param	width		The width the window.
-	 * \param	height		The height the window.
-	 * \param	state		The state of the window when opened.
-	 * \param	type		The type of drawing context installed in this window.
-	 * \param	stereoVisual    Create a stereo visual for quad buffered stereo.
-	 * \param	exclusive	Use to show the window ontop and ignore others
-	 *						(used fullscreen).
+	 * \param   title   The name of the window (displayed in the title bar of the window if the OS supports it).
+	 * \param   left        The coordinate of the left edge of the window.
+	 * \param   top     The coordinate of the top edge of the window.
+	 * \param   width       The width the window.
+	 * \param   height      The height the window.
+	 * \param   state       The state of the window when opened.
+	 * \param   type        The type of drawing context installed in this window.
+	 * \param   stereoVisual    Create a stereo visual for quad buffered stereo.
+	 * \param   exclusive   Use to show the window ontop and ignore others
+	 *                      (used fullscreen).
 	 * \param	parentWindow    Parent (embedder) window
 	 * \return	The new window (or 0 if creation failed).
 	 */
diff --git a/intern/ghost/intern/GHOST_Window.h b/intern/ghost/intern/GHOST_Window.h
index 59d3123b9de..2fa30049eca 100644
--- a/intern/ghost/intern/GHOST_Window.h
+++ b/intern/ghost/intern/GHOST_Window.h
@@ -55,14 +55,14 @@ public:
 	 * Constructor.
 	 * Creates a new window and opens it.
 	 * To check if the window was created properly, use the getValid() method.
-	 * \param width				The width the window.
-	 * \param heigh				The height the window.
-	 * \param state				The state the window is initially opened with.
-	 * \param type				The type of drawing context installed in this window.
-	 * \param stereoVisual		Stereo visual for quad buffered stereo.
-	 * \param exclusive			Use to show the window ontop and ignore others
-	 *							(used fullscreen).
-	 * \param numOfAASamples	Number of samples used for AA (zero if no AA)
+	 * \param width             The width of the window.
+	 * \param height            The height of the window.
+	 * \param state             The state the window is initially opened with.
+	 * \param type              The type of drawing context installed in this window.
+	 * \param stereoVisual      Stereo visual for quad buffered stereo.
+	 * \param exclusive         Use to show the window on top and ignore others
+	 *                          (used fullscreen).
+	 * \param numOfAASamples    Number of samples used for AA (zero if no AA)
 	 */
 	GHOST_Window(
 	    GHOST_TUns32 width,
diff --git a/intern/ghost/intern/GHOST_WindowCocoa.mm b/intern/ghost/intern/GHOST_WindowCocoa.mm
index 20060ac1267..79d2f304e60 100644
--- a/intern/ghost/intern/GHOST_WindowCocoa.mm
+++ b/intern/ghost/intern/GHOST_WindowCocoa.mm
@@ -551,7 +551,15 @@ GHOST_WindowCocoa::GHOST_WindowCocoa(
 	
 	//Creates the OpenGL View inside the window
 	m_openGLView = [[CocoaOpenGLView alloc] initWithFrame:rect];
-	
+
+	if (m_systemCocoa->m_nativePixel) {
+		// Needs to happen early when building with the 10.14 SDK, otherwise
+		// has no effect until resizing the window.
+		if ([m_openGLView respondsToSelector:@selector(setWantsBestResolutionOpenGLSurface:)]) {
+			[m_openGLView setWantsBestResolutionOpenGLSurface:YES];
+		}
+	}
+
 	[m_openGLView setSystemAndWindowCocoa:systemCocoa windowCocoa:this];
 	
 	[m_window setContentView:m_openGLView];
@@ -563,14 +571,9 @@ GHOST_WindowCocoa::GHOST_WindowCocoa(
 	updateDrawingContext();
 	activateDrawingContext();
 
-	// XXX jwilkins: This seems like it belongs in GHOST_ContextCGL, but probably not GHOST_ContextEGL
 	if (m_systemCocoa->m_nativePixel) {
-		if ([m_openGLView respondsToSelector:@selector(setWantsBestResolutionOpenGLSurface:)]) {
-			[m_openGLView setWantsBestResolutionOpenGLSurface:YES];
-		
-			NSRect backingBounds = [m_openGLView convertRectToBacking:[m_openGLView bounds]];
-			m_nativePixelSize = (float)backingBounds.size.width / (float)rect.size.width;
-		}
+		NSRect backingBounds = [m_openGLView convertRectToBacking:[m_openGLView bounds]];
+		m_nativePixelSize = (float)backingBounds.size.width / (float)rect.size.width;
 	}
 	
 	setTitle(title);
diff --git a/intern/ghost/intern/GHOST_WindowWin32.cpp b/intern/ghost/intern/GHOST_WindowWin32.cpp
index 92de41a859b..983fffc10e6 100644
--- a/intern/ghost/intern/GHOST_WindowWin32.cpp
+++ b/intern/ghost/intern/GHOST_WindowWin32.cpp
@@ -201,6 +201,11 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system,
 		// Store a pointer to this class in the window structure
 		::SetWindowLongPtr(m_hWnd, GWLP_USERDATA, (LONG_PTR) this);
 
+		if (!m_system->m_windowFocus) {
+			// Lower to bottom and don't activate if we don't want focus
+			::SetWindowPos(m_hWnd, HWND_BOTTOM, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE);
+		}
+
 		// Store the device context
 		m_hDC = ::GetDC(m_hWnd);
 
@@ -214,11 +219,11 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system,
 					nCmdShow = SW_SHOWMAXIMIZED;
 					break;
 				case GHOST_kWindowStateMinimized:
-					nCmdShow = SW_SHOWMINIMIZED;
+					nCmdShow = (m_system->m_windowFocus) ? SW_SHOWMINIMIZED : SW_SHOWMINNOACTIVE;
 					break;
 				case GHOST_kWindowStateNormal:
 				default:
-					nCmdShow = SW_SHOWNORMAL;
+					nCmdShow = (m_system->m_windowFocus) ? SW_SHOWNORMAL : SW_SHOWNOACTIVATE;
 					break;
 			}
 
@@ -1105,12 +1110,12 @@ GHOST_TSuccess GHOST_WindowWin32::endProgressBar()
 #ifdef WITH_INPUT_IME
 void GHOST_WindowWin32::beginIME(GHOST_TInt32 x, GHOST_TInt32 y, GHOST_TInt32 w, GHOST_TInt32 h, int completed)
 {
-	m_imeImput.BeginIME(m_hWnd, GHOST_Rect(x, y - h, x, y), (bool)completed);
+	m_imeInput.BeginIME(m_hWnd, GHOST_Rect(x, y - h, x, y), (bool)completed);
 }
 
 
 void GHOST_WindowWin32::endIME()
 {
-	m_imeImput.EndIME(m_hWnd);
+	m_imeInput.EndIME(m_hWnd);
 }
 #endif /* WITH_INPUT_IME */
diff --git a/intern/ghost/intern/GHOST_WindowWin32.h b/intern/ghost/intern/GHOST_WindowWin32.h
index c72669ed898..8b0ba2f1934 100644
--- a/intern/ghost/intern/GHOST_WindowWin32.h
+++ b/intern/ghost/intern/GHOST_WindowWin32.h
@@ -225,10 +225,10 @@ public:
 	 * capturing).
 	 *
 	 * \param press
-	 *		0 - mouse pressed
-	 *		1 - mouse released
-	 *		2 - operator grab
-	 *		3 - operator ungrab
+	 *      0 - mouse pressed
+	 *      1 - mouse released
+	 *      2 - operator grab
+	 *      3 - operator ungrab
 	 */
 	void registerMouseClickEvent(int press);
 
@@ -265,7 +265,7 @@ public:
 	bool m_inLiveResize;
 
 #ifdef WITH_INPUT_IME
-	GHOST_ImeWin32 *getImeInput() {return &m_imeImput;}
+	GHOST_ImeWin32 *getImeInput() {return &m_imeInput;}
 
 	void beginIME(
 	        GHOST_TInt32 x, GHOST_TInt32 y,
@@ -369,7 +369,7 @@ private:
 
 #ifdef WITH_INPUT_IME
 	/** Handle input method editors event */
-	GHOST_ImeWin32 m_imeImput;
+	GHOST_ImeWin32 m_imeInput;
 #endif
 	bool m_debug_context;
 };
diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp
index 623d57705b2..a4ccdef3788 100644
--- a/intern/ghost/intern/GHOST_WindowX11.cpp
+++ b/intern/ghost/intern/GHOST_WindowX11.cpp
@@ -517,7 +517,7 @@ GHOST_WindowX11(GHOST_SystemX11 *system,
 			natom++;
 		}
 
-		if (m_system->m_atom.WM_TAKE_FOCUS) {
+		if (m_system->m_atom.WM_TAKE_FOCUS && m_system->m_windowFocus) {
 			atoms[natom] = m_system->m_atom.WM_TAKE_FOCUS;
 			natom++;
 		}
@@ -532,7 +532,7 @@ GHOST_WindowX11(GHOST_SystemX11 *system,
 	{
 		XWMHints *xwmhints = XAllocWMHints();
 		xwmhints->initial_state = NormalState;
-		xwmhints->input = True;
+		xwmhints->input = (m_system->m_windowFocus) ? True : False;
 		xwmhints->flags = InputHint | StateHint;
 		XSetWMHints(display, m_window, xwmhints);
 		XFree(xwmhints);
@@ -586,11 +586,15 @@ GHOST_WindowX11(GHOST_SystemX11 *system,
 
 	setTitle(title);
 
-	if (exclusive) {
+	if (exclusive && system->m_windowFocus) {
 		XMapRaised(m_display, m_window);
 	}
 	else {
 		XMapWindow(m_display, m_window);
+
+		if (!system->m_windowFocus) {
+			XLowerWindow(m_display, m_window);
+		}
 	}
 	GHOST_PRINT("Mapped window\n");
 
diff --git a/intern/guardedalloc/CMakeLists.txt b/intern/guardedalloc/CMakeLists.txt
index 10ed4287185..3cec2fd1016 100644
--- a/intern/guardedalloc/CMakeLists.txt
+++ b/intern/guardedalloc/CMakeLists.txt
@@ -53,6 +53,11 @@ if(WIN32 AND NOT UNIX)
 	)
 endif()
 
+# Jemalloc 5.0.0+ needs extra configuration.
+if(WITH_MEM_JEMALLOC AND NOT ("${JEMALLOC_VERSION}" VERSION_LESS "5.0.0"))
+	add_definitions(-DWITH_JEMALLOC_CONF)
+endif()
+
 blender_add_lib(bf_intern_guardedalloc "${SRC}" "${INC}" "${INC_SYS}")
 
 # Override C++ alloc, optional.
diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c
index a95cc9163c4..8c17da853e5 100644
--- a/intern/guardedalloc/intern/mallocn.c
+++ b/intern/guardedalloc/intern/mallocn.c
@@ -37,6 +37,13 @@
 
 #include "mallocn_intern.h"
 
+#ifdef WITH_JEMALLOC_CONF
+/* If jemalloc is used, it reads this global variable and enables background
+ * threads to purge dirty pages. Otherwise we release memory too slowly or not
+ * at all if the thread that did the allocation stays inactive. */
+const char *malloc_conf = "background_thread:true,dirty_decay_ms:4000";
+#endif
+
 size_t (*MEM_allocN_len)(const void *vmemh) = MEM_lockfree_allocN_len;
 void (*MEM_freeN)(void *vmemh) = MEM_lockfree_freeN;
 void *(*MEM_dupallocN)(const void *vmemh) = MEM_lockfree_dupallocN;
diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp
index 0707c0dd3e3..3fd8f146aa3 100644
--- a/intern/locale/boost_locale_wrapper.cpp
+++ b/intern/locale/boost_locale_wrapper.cpp
@@ -112,13 +112,17 @@ const char *bl_locale_pgettext(const char *msgctxt, const char *msgid)
 			return r;
 		return msgid;
 	}
-	catch(std::bad_cast const &e) { /* if std::has_facet<char_message_facet>(l) == false, LC_ALL = "C" case */
-//		std::cout << "bl_locale_pgettext(" << msgid << "): " << e.what() << " \n";
+	catch(const std::bad_cast &e) { /* if std::has_facet<char_message_facet>(l) == false, LC_ALL = "C" case */
+#ifndef NDEBUG
+		std::cout << "bl_locale_pgettext(" << msgid << "): " << e.what() << " \n";
+#endif
 		(void)e;
 		return msgid;
 	}
-	catch(std::exception const &e) {
-//		std::cout << "bl_locale_pgettext(" << msgctxt << ", " << msgid << "): " << e.what() << " \n";
+	catch(const std::exception &e) {
+#ifndef NDEBUG
+		std::cout << "bl_locale_pgettext(" << msgctxt << ", " << msgid << "): " << e.what() << " \n";
+#endif
 		(void)e;
 		return msgid;
 	}
diff --git a/intern/smoke/intern/FLUID_3D.cpp b/intern/smoke/intern/FLUID_3D.cpp
index 8a27818ff36..fd0a7e2005f 100644
--- a/intern/smoke/intern/FLUID_3D.cpp
+++ b/intern/smoke/intern/FLUID_3D.cpp
@@ -38,7 +38,7 @@
 
 #if PARALLEL==1
 #include <omp.h>
-#endif // PARALLEL 
+#endif // PARALLEL
 
 //////////////////////////////////////////////////////////////////////
 // Construction/Destruction
@@ -51,13 +51,13 @@ FLUID_3D::FLUID_3D(int *res, float dx, float dtdef, int init_heat, int init_fire
 	_dt = dtdef;	// just in case. set in step from a RNA factor
 
 	_iterations = 100;
-	_tempAmb = 0; 
+	_tempAmb = 0;
 	_heatDiffusion = 1e-3;
 	_totalTime = 0.0f;
 	_totalSteps = 0;
 	_res = Vec3Int(_xRes,_yRes,_zRes);
 	_maxRes = MAX3(_xRes, _yRes, _zRes);
-	
+
 	// initialize wavelet turbulence
 	/*
 	if(amplify)
@@ -65,7 +65,7 @@ FLUID_3D::FLUID_3D(int *res, float dx, float dtdef, int init_heat, int init_fire
 	else
 		_wTurbulence = NULL;
 	*/
-	
+
 	// scale the constants according to the refinement of the grid
 	if (!dx)
 		_dx = 1.0f / (float)_maxRes;
@@ -218,7 +218,7 @@ void FLUID_3D::initColors(float init_r, float init_g, float init_b)
 
 void FLUID_3D::setBorderObstacles()
 {
-	
+
 	// set side obstacles
 	unsigned int index;
 	for (int y = 0; y < _yRes; y++)
@@ -331,7 +331,7 @@ void FLUID_3D::step(float dt, float gravity[3])
 	// If border rules have been changed
 	if (_colloPrev != *_borderColli) {
 		printf("Border collisions changed\n");
-		
+
 		// DG TODO: Need to check that no animated obstacle flags are overwritten
 		setBorderCollisions();
 	}
@@ -490,7 +490,7 @@ void FLUID_3D::step(float dt, float gravity[3])
 	for (int i=1; i<stepParts; i++)
 	{
 		int zPos=(int)((float)i*partSize + 0.5f);
-		
+
 		artificialDampingExactSL(zPos);
 
 	}
@@ -620,7 +620,7 @@ void FLUID_3D::artificialDampingSL(int zBegin, int zEnd) {
 void FLUID_3D::artificialDampingExactSL(int pos) {
 	const float w = 0.9;
 	int index, x,y,z;
-	
+
 
 	size_t posslab;
 
@@ -650,7 +650,7 @@ void FLUID_3D::artificialDampingExactSL(int pos) {
 							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
 							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
 							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
-					
+
 				}
 	}
 
@@ -677,7 +677,7 @@ void FLUID_3D::artificialDampingExactSL(int pos) {
 							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
 							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
 							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
-					
+
 				}
 
 	}
@@ -759,7 +759,7 @@ void FLUID_3D::wipeBoundaries(int zBegin, int zEnd)
 
 void FLUID_3D::wipeBoundariesSL(int zBegin, int zEnd)
 {
-	
+
 	/////////////////////////////////////
 	// setZeroBorder to all:
 	/////////////////////////////////////
@@ -933,16 +933,16 @@ void FLUID_3D::project()
 
 	memset(_pressure, 0, sizeof(float)*_totalCells);
 	memset(_divergence, 0, sizeof(float)*_totalCells);
-	
+
 	// set velocity and pressure inside of obstacles to zero
 	setObstacleBoundaries(_pressure, 0, _zRes);
-	
+
 	// copy out the boundaries
 	if(!_domainBcLeft)  setNeumannX(_xVelocity, _res, 0, _zRes);
-	else setZeroX(_xVelocity, _res, 0, _zRes); 
+	else setZeroX(_xVelocity, _res, 0, _zRes);
 
 	if(!_domainBcFront)   setNeumannY(_yVelocity, _res, 0, _zRes);
-	else setZeroY(_yVelocity, _res, 0, _zRes); 
+	else setZeroY(_yVelocity, _res, 0, _zRes);
 
 	if(!_domainBcTop) setNeumannZ(_zVelocity, _res, 0, _zRes);
 	else setZeroZ(_zVelocity, _res, 0, _zRes);
@@ -953,13 +953,13 @@ void FLUID_3D::project()
 		for (y = 1; y < _yRes - 1; y++, index += 2)
 			for (x = 1; x < _xRes - 1; x++, index++)
 			{
-				
+
 				if(_obstacles[index])
 				{
 					_divergence[index] = 0.0f;
 					continue;
 				}
-				
+
 
 				float xright = _xVelocity[index + 1];
 				float xleft  = _xVelocity[index - 1];
@@ -1058,7 +1058,7 @@ void FLUID_3D::project()
 //////////////////////////////////////////////////////////////////////
 void FLUID_3D::setObstacleVelocity(int zBegin, int zEnd)
 {
-	
+
 	// completely TODO <-- who wrote this and what is here TODO? DG
 
 	const size_t index_ = _slabSize + _xRes + 1;
@@ -1082,7 +1082,7 @@ void FLUID_3D::setObstacleVelocity(int zBegin, int zEnd)
 		{
 			if (!_obstacles[index])
 			{
-				// if(_obstacles[index+1]) xright = - _xVelocityOb[index]; 
+				// if(_obstacles[index+1]) xright = - _xVelocityOb[index];
 				if((_obstacles[index - 1] & 8) && abs(_xVelocityOb[index - 1]) > FLT_EPSILON )
 				{
 					// printf("velocity x!\n");
@@ -1221,7 +1221,7 @@ void FLUID_3D::setObstaclePressure(float *_pressure, int zBegin, int zEnd)
 					_pressure[index] += _pressure[index + _slabSize];
 					pcnt += 1.0f;
 				}
-				
+
 				if(pcnt > 0.000001f)
 				 	_pressure[index] /= pcnt;
 
@@ -1254,7 +1254,7 @@ void FLUID_3D::setObstacleBoundaries(float *_pressure, int zBegin, int zEnd)
 	for (int z = zBegin + bb; z < zEnd - bt; z++)
 	{
 		size_t index = index_ +(z-1)*_slabSize;
-		
+
 		for (int y = 1; y < _yRes - 1; y++, index += 2)
 		{
 			for (int x = 1; x < _xRes - 1; x++, index++)
@@ -1563,7 +1563,7 @@ void FLUID_3D::addVorticity(int zBegin, int zEnd)
 
 	// calculate normalized vorticity vectors
 	float eps = _vorticityEps;
-	
+
 	//index = _slabSize + _xRes + 1;
 	vIndex=_slabSize + _xRes + 1;
 
@@ -1618,7 +1618,7 @@ void FLUID_3D::addVorticity(int zBegin, int zEnd)
 				}		// y loop
 			//vIndex+=2*_xRes;
 		}				// z loop
-				
+
 	if (_xVorticity) delete[] _xVorticity;
 	if (_yVorticity) delete[] _yVorticity;
 	if (_zVorticity) delete[] _zVorticity;
@@ -1704,10 +1704,10 @@ void FLUID_3D::advectMacCormackEnd2(int zBegin, int zEnd)
 
 	/* set boundary conditions for velocity */
 	if(!_domainBcLeft) copyBorderX(_xVelocityTemp, res, zBegin, zEnd);
-	else setZeroX(_xVelocityTemp, res, zBegin, zEnd);				
+	else setZeroX(_xVelocityTemp, res, zBegin, zEnd);
 
 	if(!_domainBcFront) copyBorderY(_yVelocityTemp, res, zBegin, zEnd);
-	else setZeroY(_yVelocityTemp, res, zBegin, zEnd); 
+	else setZeroY(_yVelocityTemp, res, zBegin, zEnd);
 
 	if(!_domainBcTop) copyBorderZ(_zVelocityTemp, res, zBegin, zEnd);
 	else setZeroZ(_zVelocityTemp, res, zBegin, zEnd);
@@ -1778,9 +1778,9 @@ void FLUID_3D::updateFlame(float *react, float *flame, int total_cells)
 	for (int index = 0; index < total_cells; index++)
 	{
 		/* model flame temperature curve from the reaction coordinate (fuel)
-		 *	TODO: Would probably be best to get rid of whole "flame" data field.
-		 *		 Currently it's just sqrt mirror of reaction coordinate, and therefore
-		 *		 basically just waste of memory and disk space...
+		 *  TODO: Would probably be best to get rid of whole "flame" data field.
+		 *        Currently it's just sqrt mirror of reaction coordinate, and therefore
+		 *        basically just waste of memory and disk space...
 		 */
 		if (react[index]>0.0f) {
 			/* do a smooth falloff for rest of the values */