Cycles: Improved robustness of hair motion blur. [motion_curve_fix]

In some instances, the number of control vertices of a hair could change mid-frame. Cycles would then be unable to calculate proper motion blur for those hairs. This adds interpolated CVs to fill in for the missing data. While this will not necessarily result in a fully accurate reconstruction of the guide hair, it preserves motion blur instead of disabling it.

Reviewers: #cycles, sergey
Reviewed By: #cycles, sergey
Subscribers: sergey, brecht, #cycles
Tags: #cycles
Differential Revision: https://developer.blender.org/D3695
author: Stefan Werner <stefan.werner@tangent-animation.com> 2018-11-23 15:08:15 +0300
committer: Stefan Werner <stefan.werner@tangent-animation.com> 2018-11-23 15:19:53 +0300
commit: 071f4f4ce0b9520ab0c73d6d68365ad449ca8b80 (patch)
tree: 9f37bfcac669366b9ad5fb7605f2fbbed9b71b0a /intern/cycles/kernel
parent: 0a2b2d59a5897212ba3771503feb6770fb636bc8 (diff)
89 files changed, 1106 insertions, 567 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index c6e92c6d89d..92cb66bdec9 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -82,6 +82,7 @@ set(SRC_BVH_HEADERS
 	bvh/obvh_traversal.h
 	bvh/obvh_volume.h
 	bvh/obvh_volume_all.h
+	bvh/bvh_embree.h
 )
 
 set(SRC_HEADERS
@@ -96,6 +97,7 @@ set(SRC_HEADERS
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
+	kernel_id_passes.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
@@ -340,11 +342,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90")
+	if(CUDA_VERSION MATCHES "90" OR CUDA_VERSION MATCHES "91")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 8.0 is officially supported")
+			"build may succeed but only CUDA 9.0 and 9.1 are officially supported")
 	endif()
 
 	# build for each arch
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 2ad55d041bf..6708a3efac1 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,10 @@
  * the code has been extended and modified to support more primitives and work
  * with CPU/CUDA/OpenCL. */
 
+#ifdef __EMBREE__
+#  include "kernel/bvh/bvh_embree.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 #include "kernel/bvh/bvh_types.h"
@@ -32,9 +36,9 @@ CCL_NAMESPACE_BEGIN
 /* Common QBVH functions. */
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_nodes.h"
-#ifdef __KERNEL_AVX2__
-#  include "kernel/bvh/obvh_nodes.h"
-#endif
+#  ifdef __KERNEL_AVX2__
+#    include "kernel/bvh/obvh_nodes.h"
+#  endif
 #endif
 
 /* Regular BVH traversal */
@@ -160,6 +164,19 @@ CCL_NAMESPACE_BEGIN
 #undef BVH_NAME_EVAL
 #undef BVH_FUNCTION_FULL_NAME
 
+ccl_device_inline bool scene_intersect_valid(const Ray *ray)
+{
+	/* Returns true when the ray may be handed to the traversal code, which
+	 * here means that the X component of its origin is finite.
+	 *
+	 * NOTE: Due to some vectorization code, a non-finite origin point might
+	 * cause lots of false-positive intersections which will overflow the
+	 * traversal stack.
+	 * This check is a quick way to perform an early out, to avoid crashes in
+	 * such cases.
+	 * From production scenes so far it seems to be enough to test the first
+	 * element only.
+	 */
+	return isfinite(ray->P.x);
+}
+
 /* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           const Ray ray,
@@ -169,39 +186,57 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           float difl,
                                           float extmax)
 {
+	if(!scene_intersect_valid(&ray)) {
+		return false;
+	}
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		isect->t = ray.t;
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
+		IntersectContext rtc_ctx(&ctx);
+		RTCRayHit ray_hit;
+		kernel_embree_setup_rayhit(ray, ray_hit, visibility);
+		rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
+		if(ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
+			kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
+			return true;
+		}
+		return false;
+	}
+#endif  /* __EMBREE__ */
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #  ifdef __HAIR__
 		if(kernel_data.bvh.have_curves)
 			return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-#  endif /* __HAIR__ */
+#  endif  /* __HAIR__ */
 
 		return bvh_intersect_motion(kg, &ray, isect, visibility);
 	}
-#endif /* __OBJECT_MOTION__ */
+#endif  /* __OBJECT_MOTION__ */
 
 #ifdef __HAIR__
 	if(kernel_data.bvh.have_curves)
 		return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_CPU__
 
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_instancing(kg, &ray, isect, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
 	return bvh_intersect(kg, &ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
+#else  /* __KERNEL_CPU__ */
 
 #  ifdef __INSTANCING__
 	return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  else
 	return bvh_intersect(kg, &ray, isect, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
-#endif /* __KERNEL_CPU__ */
+#endif  /* __KERNEL_CPU__ */
 }
 
 #ifdef __BVH_LOCAL__
@@ -213,6 +248,58 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
                                                 uint *lcg_state,
                                                 int max_hits)
 {
+	if(!scene_intersect_valid(&ray)) {
+		return false;
+	}
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS);
+		ctx.lcg_state = lcg_state;
+		ctx.max_hits = max_hits;
+		ctx.ss_isect = local_isect;
+		local_isect->num_hits = 0;
+		ctx.sss_object_id = local_object;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+		/* Get the Embree scene for this intersection. */
+		RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
+		if(geom) {
+			float3 P = ray.P;
+			float3 dir = ray.D;
+			float3 idir = ray.D;
+			const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+			if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				Transform ob_itfm;
+				rtc_ray.tfar = bvh_instance_motion_push(kg,
+				                                        local_object,
+				                                        &ray,
+				                                        &P,
+				                                        &dir,
+				                                        &idir,
+				                                        ray.t,
+				                                        &ob_itfm);
+				/* bvh_instance_motion_push() returns the inverse transform but
+				 * it's not needed here. */
+				(void) ob_itfm;
+
+				rtc_ray.org_x = P.x;
+				rtc_ray.org_y = P.y;
+				rtc_ray.org_z = P.z;
+				rtc_ray.dir_x = dir.x;
+				rtc_ray.dir_y = dir.y;
+				rtc_ray.dir_z = dir.z;
+			}
+			RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+			if(scene) {
+				rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+			}
+		}
+
+		return local_isect->num_hits > 0;
+	}
+#endif  /* __EMBREE__ */
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_local_motion(kg,
@@ -222,7 +309,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
 		                                  lcg_state,
 		                                  max_hits);
 	}
-#endif /* __OBJECT_MOTION__ */
+#endif  /* __OBJECT_MOTION__ */
 	return bvh_intersect_local(kg,
 	                            &ray,
 	                            local_isect,
@@ -240,6 +327,27 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
                                                      uint max_hits,
                                                      uint *num_hits)
 {
+	if(!scene_intersect_valid(ray)) {
+		return false;
+	}
+#  ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
+		ctx.isect_s = isect;
+		ctx.max_hits = max_hits;
+		ctx.num_hits = 0;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW);
+		rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+
+		if(ctx.num_hits > max_hits) {
+			return true;
+		}
+		*num_hits = ctx.num_hits;
+		return rtc_ray.tfar == -INFINITY;
+	}
+#  endif
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
@@ -251,7 +359,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 			                                            max_hits,
 			                                            num_hits);
 		}
-#    endif /* __HAIR__ */
+#    endif  /* __HAIR__ */
 
 		return bvh_intersect_shadow_all_motion(kg,
 		                                       ray,
@@ -260,7 +368,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                       max_hits,
 		                                       num_hits);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
 	if(kernel_data.bvh.have_curves) {
@@ -271,7 +379,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                     max_hits,
 		                                     num_hits);
 	}
-#  endif /* __HAIR__ */
+#  endif  /* __HAIR__ */
 
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing) {
@@ -282,7 +390,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		                                           max_hits,
 		                                           num_hits);
 	}
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 
 	return bvh_intersect_shadow_all(kg,
 	                                ray,
@@ -299,24 +407,27 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
                                                  Intersection *isect,
                                                  const uint visibility)
 {
+	if(!scene_intersect_valid(ray)) {
+		return false;
+	}
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_volume_motion(kg, ray, isect, visibility);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 #  ifdef __KERNEL_CPU__
 #    ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
+#    endif  /* __INSTANCING__ */
 	return bvh_intersect_volume(kg, ray, isect, visibility);
-#  else /* __KERNEL_CPU__ */
+#  else  /* __KERNEL_CPU__ */
 #    ifdef __INSTANCING__
 	return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
 #    else
 	return bvh_intersect_volume(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
-#  endif /* __KERNEL_CPU__ */
+#    endif  /* __INSTANCING__ */
+#  endif  /* __KERNEL_CPU__ */
 }
 #endif  /* __VOLUME__ */
 
@@ -327,15 +438,31 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
                                                      const uint max_hits,
                                                      const uint visibility)
 {
+	if(!scene_intersect_valid(ray)) {
+		return false;
+	}
+#  ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
+		ctx.isect_s = isect;
+		ctx.max_hits = max_hits;
+		ctx.num_hits = 0;
+		IntersectContext rtc_ctx(&ctx);
+		RTCRay rtc_ray;
+		kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+		rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+		return rtc_ray.tfar == -INFINITY;
+	}
+#  endif
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
 	}
-#  endif /* __OBJECT_MOTION__ */
+#  endif  /* __OBJECT_MOTION__ */
 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
-#  endif /* __INSTANCING__ */
+#  endif  /* __INSTANCING__ */
 	return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
 }
 #endif  /* __VOLUME_RECORD_ALL__ */
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
new file mode 100644
index 00000000000..34a099ebb4d
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2018, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <embree3/rtcore_ray.h>
+#include <embree3/rtcore_scene.h>
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct CCLIntersectContext  {  /* Per-query state for one Embree ray query. The scene_intersect*()
+	 * functions create one of these, fill in the fields relevant to the
+	 * query type and wrap it in an IntersectContext before calling Embree. */
+	typedef enum {
+		RAY_REGULAR = 0,  /* set by scene_intersect() */
+		RAY_SHADOW_ALL = 1,  /* set by scene_intersect_shadow_all() */
+		RAY_SSS = 2,  /* set by scene_intersect_local() */
+		RAY_VOLUME_ALL = 3,  /* set by scene_intersect_volume_all() */
+		
+	} RayType;
+
+	KernelGlobals *kg;  /* kernel globals of the calling thread, as passed to the constructor */
+	RayType type;  /* which kind of query this context belongs to */
+
+	/* For shadow rays (RAY_SHADOW_ALL). scene_intersect_volume_all() sets the
+	 * same three fields for RAY_VOLUME_ALL queries. */
+	Intersection *isect_s;  /* caller-provided intersection storage */
+	int max_hits;  /* hit limit supplied by the caller; defaults to 1 */
+	int num_hits;  /* read back by scene_intersect_shadow_all() after the query;
+	                * NOTE(review): presumably incremented by an Embree filter
+	                * callback defined elsewhere - confirm. */
+
+	/* For SSS rays (RAY_SSS), set by scene_intersect_local(). */
+	LocalIntersection *ss_isect;  /* caller's local intersection result; its num_hits is the query result */
+	int sss_object_id;  /* object the local query is restricted to; -1 when unset */
+	uint *lcg_state;  /* caller's LCG state pointer, may be forwarded as-is */
+
+	CCLIntersectContext(KernelGlobals *kg_,  RayType type_)  /* defaults: one hit allowed, no hits recorded, no buffers attached */
+	{
+		kg = kg_;
+		type = type_;
+		max_hits = 1;
+		num_hits = 0;
+		isect_s = NULL;
+		ss_isect = NULL;
+		sss_object_id = -1;
+		lcg_state = NULL;
+	}
+};
+
+class IntersectContext
+{  /* Pairs the RTCIntersectContext handed to Embree with a pointer to the
+	 * Cycles-side CCLIntersectContext describing the current query. */
+public:
+	IntersectContext(CCLIntersectContext* ctx)  /* initialises `context` via Embree and remembers `ctx` */
+	{
+		rtcInitIntersectContext(&context);
+		userRayExt = ctx;
+	}
+	RTCIntersectContext context;  /* passed as &rtc_ctx.context to rtcIntersect1() / rtcOccluded1() */
+	CCLIntersectContext* userRayExt;  /* NOTE(review): presumably reached from Embree callbacks by
+	                                   * casting the RTCIntersectContext pointer back to
+	                                   * IntersectContext (`context` is the first member) -
+	                                   * confirm against the callback code. */
+};
+
+ccl_device_inline void kernel_embree_setup_ray(const Ray& ray, RTCRay& rtc_ray, const uint visibility)
+{  /* Fill `rtc_ray` from the Cycles `ray`: origin, direction, time and
+	 * maximum distance are copied, the near distance starts at zero and
+	 * `visibility` becomes the Embree ray mask.
+	 * NOTE(review): the `id` and `flags` members of RTCRay are not written
+	 * here - confirm that leaving them uninitialised is intended. */
+	rtc_ray.org_x = ray.P.x;
+	rtc_ray.org_y = ray.P.y;
+	rtc_ray.org_z = ray.P.z;
+	rtc_ray.dir_x = ray.D.x;
+	rtc_ray.dir_y = ray.D.y;
+	rtc_ray.dir_z = ray.D.z;
+	rtc_ray.tnear = 0.0f;
+	rtc_ray.tfar = ray.t;
+	rtc_ray.time = ray.time;
+	rtc_ray.mask = visibility;
+}
+
+ccl_device_inline void kernel_embree_setup_rayhit(const Ray& ray, RTCRayHit& rayhit, const uint visibility)
+{  /* Prepare `rayhit` for rtcIntersect1(): the ray half is filled by
+	 * kernel_embree_setup_ray() and the hit half is reset to "no hit". */
+	kernel_embree_setup_ray(ray, rayhit.ray, visibility);
+	rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;  /* scene_intersect() tests both ids against this value to detect a hit */
+	rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
+}
+
+ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect)
+{  /* Translate an Embree hit (`ray`, `hit`) into the Cycles `isect` record:
+	 * u/v, distance, geometric normal, primitive index, object and
+	 * primitive type are all written. */
+	bool is_hair = hit->geomID & 1;  /* odd geometry ids are treated as hair */
+	isect->u = is_hair ? hit->u : 1.0f - hit->v - hit->u;  /* non-hair hits: (u, v) is remapped to (1 - u - v, u);
+	                                                          * presumably to match Cycles' triangle convention - confirm */
+	isect->v = is_hair ? hit->v : hit->u;
+	isect->t = ray->tfar;  /* hit distance is taken from the ray's tfar */
+	isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+	if(hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {  /* hit inside an instance */
+		RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));  /* user data of the instance geometry is used as the instanced scene */
+		isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, hit->instID[0]/2);  /* primID + geometry user data (used as an integer offset) + __object_node entry of the object */
+		isect->object = hit->instID[0]/2;  /* object index is half the instance id */
+	}
+	else {  /* hit on geometry that lives directly in the top-level scene */
+		isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
+		isect->object = OBJECT_NONE;
+	}
+	isect->type = kernel_tex_fetch(__prim_type, isect->prim);  /* primitive type is looked up from the final primitive index */
+}
+
+ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int local_object_id)
+{  /* Translate an Embree hit into `isect` for a query restricted to the
+	 * object `local_object_id`. Unlike kernel_embree_convert_hit() there is
+	 * no hair case and the object comes from the argument rather than from
+	 * the hit's instance id. */
+	isect->u = 1.0f - hit->v - hit->u;  /* (u, v) is remapped to (1 - u - v, u) */
+	isect->v = hit->u;
+	isect->t = ray->tfar;  /* hit distance is taken from the ray's tfar */
+	isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+	RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, local_object_id * 2));  /* geometry id is twice the object id, as in scene_intersect_local() */
+	isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, local_object_id);  /* primID + geometry user data (used as an integer offset) + __object_node entry of the object */
+	isect->object = local_object_id;
+	isect->type = kernel_tex_fetch(__prim_type, isect->prim);  /* primitive type is looked up from the final primitive index */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 2b02f4527bb..8364bc3aa9a 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -136,7 +136,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               PATH_RAY_ALL_VISIBILITY,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -151,7 +151,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               PATH_RAY_ALL_VISIBILITY,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index d525b29fd94..64eb2f3f659 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -124,7 +124,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -139,7 +139,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index e95d2408201..af9f04db0ba 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -146,7 +146,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					                               visibility,
 					                               dist);
 				}
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 #  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					traverse_mask = NODE_INTERSECT_ROBUST(kg,
@@ -184,7 +184,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					                               visibility,
 					                               dist);
 				}
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 7d03855cb8f..12d4c5eb94a 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -120,7 +120,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -135,7 +135,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 3d9b598914f..6205b9bcf7a 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -124,7 +124,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#else // __KERNEL_SSE2__
+#else  // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
 				                               P,
 				                               dir,
@@ -139,7 +139,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               node_addr,
 				                               visibility,
 				                               dist);
-#endif // __KERNEL_SSE2__
+#endif  // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
 				node_addr_child1 = __float_as_int(cnodes.w);
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
index 92143193a6a..eb24a607caa 100644
--- a/intern/cycles/kernel/bvh/obvh_local.h
+++ b/intern/cycles/kernel/bvh/obvh_local.h
@@ -73,12 +73,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 		object = local_object;
 	}
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	avxf tnear(0.0f), tfar(isect_t);
 #if BVH_FEATURE(BVH_HAIR)
 	avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
index 3e877065127..8b739b3438a 100644
--- a/intern/cycles/kernel/bvh/obvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h
@@ -66,12 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	*num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
@@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
index 2021d8e1143..6bb19eb1ed9 100644
--- a/intern/cycles/kernel/bvh/obvh_traversal.h
+++ b/intern/cycles/kernel/bvh/obvh_traversal.h
@@ -64,12 +64,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
@@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -179,7 +173,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 					avxf cnodes;
 					/* TODO(sergey): Investigate whether moving cnodes upwards
 					 * gives a speedup (will be different cache pattern but will
-					 * avoid extra check here),
+					 * avoid extra check here).
 					 */
 #if BVH_FEATURE(BVH_HAIR)
 					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
index da9ddbd4f24..80d09c59039 100644
--- a/intern/cycles/kernel/bvh/obvh_volume.h
+++ b/intern/cycles/kernel/bvh/obvh_volume.h
@@ -52,12 +52,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
index a88573e6f86..87216127ddb 100644
--- a/intern/cycles/kernel/bvh/obvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/obvh_volume_all.h
@@ -58,12 +58,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
 	uint num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return 0;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h
index ee3827de309..22d434a8737 100644
--- a/intern/cycles/kernel/bvh/qbvh_local.h
+++ b/intern/cycles/kernel/bvh/qbvh_local.h
@@ -82,12 +82,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		object = local_object;
 	}
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	ssef tnear(0.0f), tfar(isect_t);
 #if BVH_FEATURE(BVH_HAIR)
 	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 46fd178aed6..37606e10b92 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -66,11 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	*num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
 
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
@@ -103,7 +98,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 335a4afd47a..35c6e3aeec9 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -71,12 +71,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
@@ -112,7 +106,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-				(void)inodes;
+				(void) inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -188,7 +182,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					float4 cnodes;
 					/* TODO(sergey): Investigate whether moving cnodes upwards
 					 * gives a speedup (will be different cache pattern but will
-					 * avoid extra check here),
+					 * avoid extra check here).
 					 */
 #if BVH_FEATURE(BVH_HAIR)
 					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index 192ce009524..7ec264e5f78 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -58,12 +58,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	Transform ob_itfm;
 #endif
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
 	isect->t = ray->t;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index 1e454e4d36b..dd603d79334 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -64,12 +64,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	uint num_hits = 0;
 	isect_array->t = tmax;
 
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return 0;
-	}
-#endif
-
 #if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index ff238b7a834..4e7425bd800 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -232,4 +232,4 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
+#endif  /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index b0bdea723b9..80fd9ba2b37 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -158,4 +158,4 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
+#endif  /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ee6d4cdf2df..946c460a70e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -139,4 +139,4 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_DIFFUSE_H__ */
+#endif  /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 35bb2fdf0e8..ca33a5b275c 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -103,8 +103,8 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, float3 Ng, floa
 	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
-#endif /* __OSL__ */
+#endif  /* __OSL__ */
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
+#endif  /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7b44a23f05b..e1a0cfaa3f5 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -277,4 +277,4 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_HAIR_H__ */
+#endif  /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index b3b56be39ff..68335ee887a 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -229,7 +229,7 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
-#endif /* __HAIR__ */
+#endif  /* __HAIR__ */
 
 /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
 ccl_device_inline void hair_attenuation(KernelGlobals *kg,
@@ -296,7 +296,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
 	float3 Y = float4_to_float3(bsdf->extra->geom);
 
 	float3 X = safe_normalize(sd->dPdu);
-	kernel_assert(fabsf(dot(X, Y)) < 1e-4f);
+	kernel_assert(fabsf(dot(X, Y)) < 1e-3f);
 	float3 Z = safe_normalize(cross(X, Y));
 
 	float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z));
@@ -378,7 +378,7 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
 	float3 Y = float4_to_float3(bsdf->extra->geom);
 
 	float3 X = safe_normalize(sd->dPdu);
-	kernel_assert(fabsf(dot(X, Y)) < 1e-4f);
+	kernel_assert(fabsf(dot(X, Y)) < 1e-3f);
 	float3 Z = safe_normalize(cross(X, Y));
 
 	float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z));
@@ -499,4 +499,4 @@ ccl_device void bsdf_principled_hair_blur(ShaderClosure *sc, float roughness)
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
+#endif  /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index e74d5ebaa42..32b6e50b09a 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -1124,4 +1124,4 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_MICROFACET_H__ */
+#endif  /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index e73915dbda7..5d300ef6db5 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -76,7 +76,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 		eval *= -lambda_r / (shadowing_lambda - lambda_r);
 	else
 		eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 	const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
 	float val = G2 * 0.25f / wi.z;
 	if(alpha.x == alpha.y)
@@ -129,7 +129,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 				phase = mf_eval_phase_glass(wr, lambda_r,  wo,  wo_outside, alpha, eta);
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
 #endif
 			eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
@@ -153,7 +153,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			else if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
 			}
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 			if(use_fresnel && order > 0) {
 				throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 			}
@@ -248,7 +248,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
 					throughput *= t_color;
 			}
 		}
-#else /* MF_MULTI_GLOSSY */
+#else  /* MF_MULTI_GLOSSY */
 		if(use_fresnel) {
 			float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 6b770fc0c16..3446d1609d9 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -108,4 +108,4 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_OREN_NAYAR_H__ */
+#endif  /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 91c7803346d..83da05ac435 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -135,8 +135,8 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, float3 Ng, float3
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
-#endif /* __OSL__ */
+#endif  /* __OSL__ */
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PHONG_RAMP_H__ */
+#endif  /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 83be2b35a00..2f65fd54be2 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -122,4 +122,4 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+#endif  /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 8b7c4399516..ccdcb1babd2 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -108,4 +108,4 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+#endif  /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index b33b6e3597b..94f1c283af7 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -77,4 +77,4 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_REFLECTION_H__ */
+#endif  /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index b181650e928..abdd01c7a1d 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -86,4 +86,4 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_REFRACTION_H__ */
+#endif  /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 6d8074b7130..097a56f22eb 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -215,4 +215,4 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_TOON_H__ */
+#endif  /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index f788dbcd0ff..060dff69f52 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -106,4 +106,4 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, float3 Ng, float
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_TRANSPARENT_H__ */
+#endif  /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b080e025d16..4f3453675c7 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -158,4 +158,4 @@ ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, floa
 
 CCL_NAMESPACE_END
 
-#endif /* __BSDF_UTIL_H__ */
+#endif  /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index ba0c6ae8c61..98c7f23c288 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -499,4 +499,4 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_BSSRDF_H__ */
+#endif  /* __KERNEL_BSSRDF_H__ */
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
index f6e474d6702..4209d69ee73 100644
--- a/intern/cycles/kernel/filter/filter.h
+++ b/intern/cycles/kernel/filter/filter.h
@@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#endif /* __FILTER_H__ */
+#endif  /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 1a2f22a6987..67f4e62ac0f 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -68,4 +68,4 @@ typedef struct TileInfo {
 #  define ccl_get_tile_buffer(id) (tile_info->buffers[id])
 #endif
 
-#endif /* __FILTER_DEFINES_H__*/
+#endif  /* __FILTER_DEFINES_H__ */
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index e2da0fd872b..af73c0dadf2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,6 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+
 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
+	/* Strides need to be aligned to 16 bytes. */
+	kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+	int aligned_lowx = rect.x & (~3);
+	const int numChannels = (channel_offset > 0)? 3 : 1;
+	const float4 channel_fac = make_float4(1.0f / numChannels);
+
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			float diff = 0.0f;
-			int numChannels = channel_offset? 3 : 1;
-			for(int c = 0; c < numChannels; c++) {
-				float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-				float pvar = variance_image[c*channel_offset + y*stride + x];
-				float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
-			}
-			if(numChannels > 1) {
-				diff *= 1.0f/numChannels;
+		int idx_p = y*stride + aligned_lowx;
+		int idx_q = (y+dy)*stride + aligned_lowx + dx;
+		for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+			float4 diff = make_float4(0.0f);
+			for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+				/* idx_p is guaranteed to be aligned, but idx_q isn't. */
+				float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+				float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+				float4 cdiff = color_p - color_q;
+				float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+				float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+				diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
 			}
-			difference_image[y*stride + x] = diff;
+			load4_a(difference_image, idx_p) = diff*channel_fac;
 		}
 	}
 }
@@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int stride,
                                               int f)
 {
-	int aligned_lowx = rect.x / 4;
-	int aligned_highx = (rect.z + 3) / 4;
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
 		const int low = max(rect.y, y-f);
 		const int high = min(rect.w, y+f+1);
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] = 0.0f;
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = make_float4(0.0f);
 		}
 		for(int y1 = low; y1 < high; y1++) {
-			float4* out_image4 = (float4*)(out_image + y*stride);
-			float4* difference_image4 = (float4*)(difference_image + y1*stride);
-			for(int x = aligned_lowx; x < aligned_highx; x++) {
-				out_image4[x] += difference_image4[x];
+			for(int x = aligned_lowx; x < rect.z; x += 4) {
+				load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
 			}
 		}
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] *= 1.0f/(high - low);
+		float fac = 1.0f/(high - low);
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) *= fac;
 		}
 	}
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                                     float *out_image,
-                                                     int4 rect,
-                                                     int stride,
-                                                     int f)
+ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
+                                           float *out_image,
+                                           int4 rect,
+                                           int stride,
+                                           int f)
 {
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*stride + x] = 0.0f;
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = make_float4(0.0f);
 		}
 	}
+
 	for(int dx = -f; dx <= f; dx++) {
-		int pos_dx = max(0, dx);
-		int neg_dx = min(0, dx);
+		aligned_lowx = round_down(rect.x - min(0, dx), 4);
+		int highx = rect.z - max(0, dx);
+		int4 lowx4 = make_int4(rect.x - min(0, dx));
+		int4 highx4 = make_int4(rect.z - max(0, dx));
 		for(int y = rect.y; y < rect.w; y++) {
-			for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-				out_image[y*stride + x] += difference_image[y*stride + x+dx];
+			for(int x = aligned_lowx; x < highx; x += 4) {
+				int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+				int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+				float4 diff = load4_u(difference_image, y*stride + x + dx);
+				load4_a(out_image, y*stride + x) += mask(active, diff);
 			}
 		}
 	}
+
+	aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+			float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+			float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
+			load4_a(out_image, y*stride + x) *= rcp(high - low);
+		}
+	}
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image,
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
+{
+	nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+	int aligned_lowx = round_down(rect.x, 4);
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
 		}
 	}
 }
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
+                                                       float *temp_image,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
                                                        int stride,
                                                        int f)
 {
+	nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+
+	int aligned_lowx = round_down(rect.x, 4);
 	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			float sum = 0.0f;
-			for(int x1 = low; x1 < high; x1++) {
-				sum += difference_image[y*stride + x1];
-			}
-			float weight = sum * (1.0f/(high - low));
-			accum_image[y*stride + x] += weight;
-			out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
+		for(int x = aligned_lowx; x < rect.z; x += 4) {
+			int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+			int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+
+			int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+
+			float4 weight = load4_a(temp_image, idx_p);
+			load4_a(accum_image, idx_p) += mask(active, weight);
+
+			float4 val = load4_u(image, idx_q);
+
+			load4_a(out_image, idx_p) += mask(active, weight*val);
 		}
 	}
 }
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
 	}
 }
 
+#undef load4_a
+#undef load4_u
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 4cfbe21685c..b6b58b52a29 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -379,7 +379,7 @@ ccl_device_forceinline bool cardinal_curve_intersect(
 					float inv_mw_extension = 1.0f/mw_extension;
 					if(d0 >= 0)
 						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
+					else  // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
 
@@ -817,16 +817,24 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
 			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
-			sd->Ng = normalize(P - p_curr);
+#ifdef __EMBREE__
+			if(kernel_data.bvh.scene) {
+				sd->Ng = normalize(isect->Ng);
+			}
+			else
+#endif
+			{
+				/* direction from inside to surface of curve */
+				float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+				sd->Ng = normalize(P - p_curr);
 
-			/* adjustment for changing radius */
-			float gd = isect->v;
+				/* adjustment for changing radius */
+				float gd = isect->v;
 
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				if(gd != 0.0f) {
+					sd->Ng = sd->Ng - gd * tg;
+					sd->Ng = normalize(sd->Ng);
+				}
 			}
 		}
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index cfe17e63627..669c932d720 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -78,6 +78,12 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int
 	const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
 
 	Transform tfm;
+#ifdef __EMBREE__
+	if(kernel_data.bvh.scene) {
+		transform_motion_array_interpolate_straight(&tfm, motion, num_steps, time);
+	}
+	else
+#endif
 	transform_motion_array_interpolate(&tfm, motion, num_steps, time);
 
 	return tfm;
@@ -304,6 +310,26 @@ ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 	return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
 }
 
+/* Cryptomatte ID */
+
+/* Value of the cryptomatte_object field stored for `object`, or 0.0f for OBJECT_NONE. */
+ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+{
+	if(object == OBJECT_NONE)
+		return 0.0f;
+
+	return kernel_tex_fetch(__objects, object).cryptomatte_object;
+}
+
+/* Value of the cryptomatte_asset field stored for `object`, or 0.0f for OBJECT_NONE. */
+ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+{
+	if(object == OBJECT_NONE)
+		return 0.0f;
+
+	return kernel_tex_fetch(__objects, object).cryptomatte_asset;
+}
+
 /* Particle data from which object was instanced */
 
 ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 00ce89ae567..8c0d0a9770e 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -146,7 +146,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		return a;
 	}
 	else
-#endif /* __PATCH_EVAL__ */
+#endif  /* __PATCH_EVAL__ */
 	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
@@ -271,7 +271,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		return a;
 	}
 	else
-#endif /* __PATCH_EVAL__ */
+#endif  /* __PATCH_EVAL__ */
 	if(desc.element == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index aa6b102a0f3..57f4c86d403 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -71,28 +71,23 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 }
 
 #ifdef __KERNEL_AVX2__
-
 #define	cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
-#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
-ccl_device_inline
-#else
-ccl_device_forceinline
-#endif
-int ray_triangle_intersect8(KernelGlobals *kg,
-                            float3 ray_P,
-                            float3 ray_dir,
-                            Intersection **isect,
-                            uint visibility,
-                            int object,
-                            __m256 *triA,
-                            __m256 *triB,
-                            __m256 *triC,
-                            int prim_addr,
-                            int prim_num,
-                            uint *num_hits,
-                            uint max_hits,
-                            int *num_hits_in_instance,
-                            float isec_t)
+ccl_device_inline int ray_triangle_intersect8(
+            KernelGlobals *kg,
+            float3 ray_P,
+            float3 ray_dir,
+            Intersection **isect,
+            uint visibility,
+            int object,
+            __m256 *triA,
+            __m256 *triB,
+            __m256 *triC,
+            int prim_addr,
+            int prim_num,
+            uint *num_hits,
+            uint max_hits,
+            int *num_hits_in_instance,
+            float isect_t)
 {
 
 	const unsigned char prim_num_mask = (1 << prim_num) - 1;
@@ -108,10 +103,6 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
 
 	/* Calculate vertices relative to ray origin. */
-	/*	const float3 v0 = tri_c - P;
-	const float3 v1 = tri_a - P;
-	const float3 v2 = tri_b - P; */
-
 	__m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
 	__m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
 	__m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
@@ -136,11 +127,7 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	__m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
 	__m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
 
-	/* Calculate triangle edges.
-	const float3 e0 = v2 - v0;
-	const float3 e1 = v0 - v1;
-	const float3 e2 = v1 - v2;*/
-
+	/* Calculate triangle edges. */
 	__m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
 	__m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
 	__m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
@@ -153,48 +140,32 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	__m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
 	__m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
 
-	/* Perform edge tests.
-	const float U = dot(cross(v2 + v0, e0), ray_dir);
-	const float V = dot(cross(v0 + v1, e1), ray_dir);
-	const float W = dot(cross(v1 + v2, e2), ray_dir);*/
-
-	//cross (AyBz - AzBy, AzBx -AxBz,  AxBy - AyBx)
+	/* Perform edge tests. */
+	/* cross(A, B) = (AyBz - AzBy, AzBx - AxBz, AxBy - AyBx) */
 	__m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
 	__m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
 	__m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
-	U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256));
-	U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256));
+	U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256);
+	U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256);
 
 	__m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
 	__m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
 	__m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
-	V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256));
-	V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256));
+	V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);
+	V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);
 
 	__m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
 	__m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
 	__m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
-	//vertical dot
+	/* vertical dot */
 	__m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
-	W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256));
-	W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256));
-
-	//const float minUVW = min(U, min(V, W));
-	//const float maxUVW = max(U, max(V, W));
-#if 0
-	__m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
-	__m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
-
-	//if(minUVW < 0.0f && maxUVW > 0.0f)
-	__m256i mask_minmaxUVW_256 = _mm256_and_si256(
-		_mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
-		//_mm256_castps_si256(minUVW_256),
-		_mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
-#else
+	W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);
+	W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);
+
 	__m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
 	__m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
 	__m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
@@ -204,9 +175,8 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	const __m256i two256 = _mm256_set1_epi32(2);
 
 	__m256i mask_minmaxUVW_256 = _mm256_or_si256(
-		_mm256_cmpeq_epi32(one256, UVW_256_1),
-		_mm256_cmpeq_epi32(two256, UVW_256_1) );
-#endif
+	        _mm256_cmpeq_epi32(one256, UVW_256_1),
+	        _mm256_cmpeq_epi32(two256, UVW_256_1));
 
 	unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
 	if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
@@ -214,231 +184,187 @@ int ray_triangle_intersect8(KernelGlobals *kg,
 	}
 
 	/* Calculate geometry normal and denominator. */
-	//			const float3 Ng1 = cross(e1, e0);
-	//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
-
 	__m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
 	__m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
 	__m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
 
-	//const float3 Ng = Ng1 + Ng1;
 	Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
 	Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
 	Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
 
-	//const float den = dot3(Ng, dir);
-	//vertical dot
+	/* vertical dot */
 	__m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
-	den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256));
-	den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256));
-
-	// __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+	den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);
+	den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);
 
 	/* Perform depth test. */
-	//const float T = dot3(v0, Ng);
 	__m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
-	T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256));
-	T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256));
+	T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);
+	T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);
 
-	//const int sign_den = (__float_as_int(den) & 0x80000000);
 	const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
 	__m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
 
-	//const float sign_T = xor_signmask(T, sign_den);
 	__m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
 
-	/*if((sign_T < 0.0f) || mask_minmaxUVW_pos {	return false;}	*/
 	unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
 	if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
 		return false;
-	} /**/
+	} 
 
 	__m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
 
-
 	ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
 	ccl_align(32) unsigned int mask_minmaxUVW8[8];
 
-	if(visibility == PATH_RAY_SHADOW_OPAQUE){
-			__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256
-
-			__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
-
-			__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
-			__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
-
-			__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
-				_mm256_castps_si256(
-					_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
-				)
-			);
-			/*	__m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256,
-			_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256),
-			_CMP_GT_OS
-			) );*/
-
-			mask0 = _mm256_or_si256(mask1, mask0);
-			//unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0));
-			//unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256));
-			//unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos));
-			mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
-			mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
-
-			unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
-			if((mask_final & prim_num_mask) == 0) { //all bits NOT set
-				return false;
-			}		/**/
-
-			unsigned long i = 0;
-#if defined(_MSC_VER)
-			unsigned char res = _BitScanForward(&i, (unsigned long)mask_final);
-#else
-            i = __builtin_ffs(mask_final)-1;
-#endif
-
-			den_256 = _mm256_rcp_ps(den_256); //inv_den
-			U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den
-			V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den
-			T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den
-
-			_mm256_store_ps(U8, U_256);
-			_mm256_store_ps(V8, V_256);
-			_mm256_store_ps(T8, T_256);
-
-
-			//here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true
-
-			(*isect)->u = U8[i];
-			(*isect)->v = V8[i];
-			(*isect)->t = T8[i];
-
-			(*isect)->prim = (prim_addr + i);
-			(*isect)->object = object;
-			(*isect)->type = PRIMITIVE_TRIANGLE;
-
-			return true;
+	if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+		__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);
+		__m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+		__m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
+		__m256 rayt_256 = _mm256_set1_ps((*isect)->t);
+		__m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
+			_mm256_castps_si256(
+				_mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
+			)
+		);
+		mask0 = _mm256_or_si256(mask1, mask0);
+		mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
+		mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
+		unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
+		if((mask_final & prim_num_mask) == 0) {
+			return false;
 		}
+		const int i = __bsf(mask_final);
+		__m256 inv_den_256 = _mm256_rcp_ps(den_256);
+		U_256 = _mm256_mul_ps(U_256, inv_den_256);
+		V_256 = _mm256_mul_ps(V_256, inv_den_256);
+		T_256 = _mm256_mul_ps(T_256, inv_den_256);
+		_mm256_store_ps(U8, U_256);
+		_mm256_store_ps(V8, V_256);
+		_mm256_store_ps(T8, T_256);
+		/* NOTE: Here we assume visibility for all triangles in the node is
+		 * the same. */
+		(*isect)->u = U8[i];
+		(*isect)->v = V8[i];
+		(*isect)->t = T8[i];
+		(*isect)->prim = (prim_addr + i);
+		(*isect)->object = object;
+		(*isect)->type = PRIMITIVE_TRIANGLE;
+		return true;
+	}
 	else {
-			_mm256_store_ps(den8, den_256);
-			_mm256_store_ps(U8, U_256);
-			_mm256_store_ps(V8, V_256);
-			_mm256_store_ps(T8, T_256);
+		_mm256_store_ps(den8, den_256);
+		_mm256_store_ps(U8, U_256);
+		_mm256_store_ps(V8, V_256);
+		_mm256_store_ps(T8, T_256);
 
-			_mm256_store_ps(sign_T8, sign_T_256);
-			_mm256_store_ps(xor_signmask8, xor_signmask_256);
-			_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
+		_mm256_store_ps(sign_T8, sign_T_256);
+		_mm256_store_ps(xor_signmask8, xor_signmask_256);
+		_mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
 
-			int ret = false;
+		int ret = false;
 
-			if(visibility == PATH_RAY_SHADOW) {
-				for(int i = 0; i < prim_num; i++) {
-					if(!mask_minmaxUVW8[i]) {
+		if(visibility == PATH_RAY_SHADOW) {
+			for(int i = 0; i < prim_num; i++) {
+				if(mask_minmaxUVW8[i]) {
+					continue;
+				}
 #ifdef __VISIBILITY_FLAG__
-						if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+				if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
+					continue;
+				}
 #endif
-						{
-							if((sign_T8[i] >= 0.0f) &&
-							   (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
-							{
-								if(den8[i]) {
-									const float inv_den = 1.0f / den8[i];
-
-									(*isect)->u = U8[i] * inv_den;
-									(*isect)->v = V8[i] * inv_den;
-									(*isect)->t = T8[i] * inv_den;
-
-									(*isect)->prim = (prim_addr + i);
-									(*isect)->object = object;
-									(*isect)->type = PRIMITIVE_TRIANGLE;
-
-									int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
-									int shader = 0;
-
+				if((sign_T8[i] < 0.0f) ||
+				   (sign_T8[i] > (*isect)->t * xor_signmask8[i]))
+				{
+					continue;
+				}
+				if(!den8[i]) {
+					continue;
+				}
+				const float inv_den = 1.0f / den8[i];
+				(*isect)->u = U8[i] * inv_den;
+				(*isect)->v = V8[i] * inv_den;
+				(*isect)->t = T8[i] * inv_den;
+				(*isect)->prim = (prim_addr + i);
+				(*isect)->object = object;
+				(*isect)->type = PRIMITIVE_TRIANGLE;
+				const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
+				int shader = 0;
 #ifdef __HAIR__
-									if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
+				if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
-									{
-										shader = kernel_tex_fetch(__tri_shader, prim);
-									}
+				{
+					shader = kernel_tex_fetch(__tri_shader, prim);
+				}
 #ifdef __HAIR__
-									else {
-										float4 str = kernel_tex_fetch(__curves, prim);
-										shader = __float_as_int(str.z);
-									}
+				else {
+					float4 str = kernel_tex_fetch(__curves, prim);
+					shader = __float_as_int(str.z);
+				}
 #endif
-									int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
-									/* if no transparent shadows, all light is blocked */
-									if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-										return 2;
-									}
-									/* if maximum number of hits reached, block all light */
-									else if(*num_hits == max_hits) {
-										return 2;
-									}
-									/* move on to next entry in intersections array */
-									ret = true;
-
-									(*isect)++;
-									(*num_hits)++;
-
-									(*num_hits_in_instance)++;
-
-									(*isect)->t = isec_t;
-
-								} //den
-							} //if sign
-						} //vis
-					}//if mask
-				} //for
+				const int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+				/* If no transparent shadows, all light is blocked. */
+				if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+					return 2;
+				}
+				/* If maximum number of hits reached, block all light. */
+				else if(num_hits == NULL || *num_hits == max_hits) {
+					return 2;
+				}
+				/* Move on to next entry in intersections array. */
+				ret = true;
+				(*isect)++;
+				(*num_hits)++;
+				(*num_hits_in_instance)++;
+				(*isect)->t = isect_t;
+			}
 		}
-		else { //default case
+		else {
 			for(int i = 0; i < prim_num; i++) {
-				if(!mask_minmaxUVW8[i]) {
+				if(mask_minmaxUVW8[i]) {
+					continue;
+				}
 #ifdef __VISIBILITY_FLAG__
-					if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+				if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
+					continue;
+				}
 #endif
-					{
-						if((sign_T8[i] >= 0.0f) &&
-						   (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
-						{
-							if(den8[i]) {
-								const float inv_den = 1.0f / den8[i];
-
-								(*isect)->u = U8[i] * inv_den;
-								(*isect)->v = V8[i] * inv_den;
-								(*isect)->t = T8[i] * inv_den;
-
-								(*isect)->prim = (prim_addr + i);
-								(*isect)->object = object;
-								(*isect)->type = PRIMITIVE_TRIANGLE;
-
-								ret = true;
-							} //den
-						} //if sign
-					} //vis
-				}//if mask
-			} //for
-		} //default
-	return ret;
-}// else PATH_RAY_SHADOW_OPAQUE
-
+				if((sign_T8[i] < 0.0f) ||
+				   (sign_T8[i] > (*isect)->t * xor_signmask8[i]))
+				{
+					continue;
+				}
+				if(!den8[i]) {
+					continue;
+				}
+				const float inv_den = 1.0f / den8[i];
+				(*isect)->u = U8[i] * inv_den;
+				(*isect)->v = V8[i] * inv_den;
+				(*isect)->t = T8[i] * inv_den;
+				(*isect)->prim = (prim_addr + i);
+				(*isect)->object = object;
+				(*isect)->type = PRIMITIVE_TRIANGLE;
+				ret = true;
+			}
+		}
+		return ret;
+	}
 }
 
-//vz static
-ccl_device_inline
-int triangle_intersect8(KernelGlobals *kg,
-                        Intersection **isect,
-                        float3 P,
-                        float3 dir,
-                        uint visibility,
-                        int object,
-                        int prim_addr,
-                        int prim_num,
-                        uint *num_hits,
-                        uint max_hits,
-                        int *num_hits_in_instance,
-                        float isec_t)
+ccl_device_inline int triangle_intersect8(
+        KernelGlobals *kg,
+        Intersection **isect,
+        float3 P,
+        float3 dir,
+        uint visibility,
+        int object,
+        int prim_addr,
+        int prim_num,
+        uint *num_hits,
+        uint max_hits,
+        int *num_hits_in_instance,
+        float isect_t)
  {
 	__m128 tri_a[8], tri_b[8], tri_c[8];
 	__m256  tritmp[12], tri[12];
@@ -540,11 +466,11 @@ int triangle_intersect8(KernelGlobals *kg,
 	                                     num_hits,
 	                                     max_hits,
 	                                     num_hits_in_instance,
-	                                     isec_t);
+	                                     isect_t);
 	return result;
 }
 
-#endif /* __KERNEL_AVX2__ */
+#endif  /* __KERNEL_AVX2__ */
 
 /* Special ray intersection routines for subsurface scattering. In that case we
  * only want to intersect with primitives in the same object, and if case of
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 373324afb01..1c8c91d15e6 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -63,4 +63,4 @@ void kernel_tex_copy(KernelGlobals *kg,
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_H__ */
+#endif  /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 990e798543a..ea478a8a5d3 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -35,4 +35,4 @@ ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_COLOR_H__ */
+#endif  /* __KERNEL_COLOR_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index aa7a16afa1d..4ee80850402 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -153,4 +153,4 @@ typedef vector3<avxf> avx3f;
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_COMPAT_CPU_H__ */
+#endif  /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index ac63bcf7ac9..8ed96bbae64 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -150,4 +150,4 @@ ccl_device_inline uint ccl_num_groups(uint d)
 #define logf(x) __logf(((float)(x)))
 #define expf(x) __expf(((float)(x)))
 
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+#endif  /* __KERNEL_COMPAT_CUDA_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 3f7e264fbee..21a95098894 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -159,4 +159,4 @@
 #include "util/util_half.h"
 #include "util/util_types.h"
 
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
+#endif  /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 74cfacb5bc1..37402f42863 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -21,6 +21,7 @@
 
 #ifdef __KERNEL_CPU__
 #  include "util/util_vector.h"
+#  include "util/util_map.h"
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -42,6 +43,8 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif
 
+typedef unordered_map<float, float> CoverageMap;
+
 struct Intersection;
 struct VolumeStep;
 
@@ -68,6 +71,11 @@ typedef struct KernelGlobals {
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
 
+	/* A buffer for storing per-pixel coverage for Cryptomatte. */
+	CoverageMap *coverage_object;
+	CoverageMap *coverage_material;
+	CoverageMap *coverage_asset;
+
 	/* split kernel */
 	SplitData split_data;
 	SplitParams split_param_data;
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
new file mode 100644
index 00000000000..ee3b8b8abfb
--- /dev/null
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -0,0 +1,94 @@
+/*
+* Copyright 2018 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, float weight)
+{
+	kernel_assert(id != ID_NONE);
+	if(weight == 0.0f) {
+		return;
+	}
+	
+	for(int slot = 0; slot < num_slots; slot++) {
+		ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+#ifdef __ATOMIC_PASS_WRITE__
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			/* Use an atomic to claim this slot.
+			* If a different thread claimed it first for another ID, continue the search with the next slot. */
+			float old_id = atomic_compare_and_swap_float(buffer+slot*2, ID_NONE, id);
+			if(old_id != ID_NONE && old_id != id) {
+				continue;
+			}
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		 * If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+#else  /* __ATOMIC_PASS_WRITE__ */
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			id_buffer[slot].x = id;
+			id_buffer[slot].y = weight;
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		* If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			id_buffer[slot].y += weight;
+			break;
+		}
+#endif  /* __ATOMIC_PASS_WRITE__ */
+	}
+}
+
+ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
+{
+	ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+	for(int slot = 1; slot < num_slots; ++slot) {
+		if(id_buffer[slot].x == ID_NONE) {
+			return;
+		}
+		/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
+		int i = slot;
+		while(i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
+			float2 swap = id_buffer[i];
+			id_buffer[i] = id_buffer[i - 1];
+			id_buffer[i - 1] = swap;
+			--i;
+		}
+	}
+}
+
+#ifdef __KERNEL_GPU__
+/* post-sorting for Cryptomatte */
+ccl_device void kernel_cryptomatte_post(KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+{
+	if(sample - 1 == kernel_data.integrator.aa_samples) {
+		int index = offset + x + y * stride;
+		int pass_stride = kernel_data.film.pass_stride;
+		ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + kernel_data.film.pass_cryptomatte;
+		kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+	}
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..a8a43f3ea4a 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -25,4 +25,4 @@
 #include "util/util_texture.h"
 #include "util/util_transform.h"
 
-#endif /* __KERNEL_MATH_H__ */
+#endif  /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index 9b96bb80c32..dde93844dd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -187,7 +187,10 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u,
 ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
 {
 	float3 R = 2*dot(N, I)*N - I;
-	if(dot(Ng, R) >= 0.05f) {
+
+	/* A reflected ray is always allowed to be at least as shallow as the incoming ray. */
+	float threshold = min(0.9f*dot(Ng, I), 0.01f);
+	if(dot(Ng, R) >= threshold) {
 		return N;
 	}
 
@@ -195,24 +198,88 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
 	 * The X axis is found by normalizing the component of N that's orthogonal to Ng.
 	 * The Y axis isn't actually needed.
 	 */
-	float3 X = normalize(N - dot(N, Ng)*Ng);
-
-	/* Calculate N.z and N.x in the local coordinate system. */
-	float Iz = dot(I, Ng);
-	float Ix2 = sqr(dot(I, X)), Iz2 = sqr(Iz);
-	float Ix2Iz2 = Ix2 + Iz2;
-
-	float a = safe_sqrtf(Ix2*(Ix2Iz2 - sqr(0.05f)));
-	float b = Iz*0.05f + Ix2Iz2;
-	float c = (a + b > 0.0f)? (a + b) : (-a + b);
+	float NdotNg = dot(N, Ng);
+	float3 X = normalize(N - NdotNg*Ng);
+
+	/* Calculate N.z and N.x in the local coordinate system.
+	 *
+	 * The goal of this computation is to find a N' that is rotated towards Ng just enough
+	 * to lift R' above the threshold (here called t), therefore dot(R', Ng) = t.
+	 *
+	 * According to the standard reflection equation, this means that we want dot(2*dot(N', I)*N' - I, Ng) = t.
+	 *
+	 * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get 2*dot(N', I)*N'.z - I.z = t.
+	 *
+	 * The rotation is simple to express in the coordinate system we formed - since N lies in the X-Z-plane, we know that
+	 * N' will also lie in the X-Z-plane, so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z .
+	 *
+	 * Furthermore, we want N' to be normalized, so N'.x = sqrt(1 - N'.z^2).
+	 *
+	 * With these simplifications, we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t.
+	 *
+	 * The only unknown here is N'.z, so we can solve for that.
+	 *
+	 * The equation has four solutions in general:
+	 *
+	 * N'.z = +-sqrt(0.5*(+-sqrt(I.x^2*(I.x^2 + I.z^2 - t^2)) + t*I.z + I.x^2 + I.z^2)/(I.x^2 + I.z^2))
+	 * We can simplify this expression a bit by grouping terms:
+	 *
+	 * a = I.x^2 + I.z^2
+	 * b = sqrt(I.x^2 * (a - t^2))
+	 * c = I.z*t + a
+	 * N'.z = +-sqrt(0.5*(+-b + c)/a)
+	 *
+	 * Two solutions can immediately be discarded because they're negative so N' would lie in the lower hemisphere.
+	 */
+	float Ix = dot(I, X), Iz = dot(I, Ng);
+	float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+	float a = Ix2 + Iz2;
+
+	float b = safe_sqrtf(Ix2*(a - sqr(threshold)));
+	float c = Iz*threshold + a;
+
+	/* Evaluate both solutions.
+	 * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than one), so check for that first.
+	 * If no option is viable (might happen in extreme cases like N being in the wrong hemisphere), give up and return Ng. */
+	float fac = 0.5f/a;
+	float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c);
+	bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f));
+	bool valid2 = (N2_z2 > 1e-5f) && (N2_z2 <= (1.0f + 1e-5f));
+
+	float2 N_new;
+	if(valid1 && valid2) {
+		/* If both are possible, do the expensive reflection-based check. */
+		float2 N1 = make_float2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2));
+		float2 N2 = make_float2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2));
+
+		float R1 = 2*(N1.x*Ix + N1.y*Iz)*N1.y - Iz;
+		float R2 = 2*(N2.x*Ix + N2.y*Iz)*N2.y - Iz;
+
+		valid1 = (R1 >= 1e-5f);
+		valid2 = (R2 >= 1e-5f);
+		if(valid1 && valid2) {
+			/* If both solutions are valid, return the one with the shallower reflection since it will be closer to the input
+			 * (if the original reflection wasn't shallow, we would not be in this part of the function). */
+			N_new = (R1 < R2)? N1 : N2;
+		}
+		else {
+			/* If only one reflection is valid (= positive), pick that one. */
+			N_new = (R1 > R2)? N1 : N2;
+		}
 
-	float Nz = safe_sqrtf(0.5f * c * (1.0f / Ix2Iz2));
-	float Nx = safe_sqrtf(1.0f - sqr(Nz));
+	}
+	else if(valid1 || valid2) {
+		/* Only one solution passes the N'.z criterion, so pick that one. */
+		float Nz2 = valid1? N1_z2 : N2_z2;
+		N_new = make_float2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2));
+	}
+	else {
+		return Ng;
+	}
 
-	/* Transform back into global coordinates. */
-	return Nx*X + Nz*Ng;
+	return N_new.x*X + N_new.y*Ng;
 }
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_MONTECARLO_CL__ */
+#endif  /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 458aa6c2a97..80477f921ea 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-CCL_NAMESPACE_BEGIN
-
 #if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
 #define __ATOMIC_PASS_WRITE__
 #endif
 
+#include "kernel/kernel_id_passes.h"
+
+CCL_NAMESPACE_BEGIN
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
@@ -108,7 +110,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
 	float value = path_total_shaded / max(path_total, 1e-7f);
 	kernel_write_pass_float(buffer+2, value*value);
 }
-#endif /* __DENOISING_FEATURES__ */
+#endif  /* __DENOISING_FEATURES__ */
 
 ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
                                                         ShaderData *sd,
@@ -187,7 +189,24 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
 		                        L->debug_data.num_ray_bounces);
 	}
 }
-#endif /* __KERNEL_DEBUG__ */
+#endif  /* __KERNEL_DEBUG__ */
+
+#ifdef __KERNEL_CPU__
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
+ccl_device_inline size_t kernel_write_id_pass_cpu(float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+{
+	if(map) {
+		(*map)[id] += matte_weight;
+		return 0;
+	}
+#else  /* __KERNEL_CPU__ */
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) 
+ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, size_t depth, float id, float matte_weight)
+{
+#endif  /* __KERNEL_CPU__ */
+	kernel_write_id_slots(buffer, depth, id, matte_weight);
+	return depth * 2;
+}
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
 	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
@@ -242,6 +261,26 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		}
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd)));
+		if(matte_weight > 0.0f) {
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+				float id = object_cryptomatte_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+				float id = shader_cryptomatte_id(kg, sd->shader);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+				float id = object_cryptomatte_asset_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+			}
+		}
+	}
+
+
 	if(light_flag & PASSMASK_COMPONENT(DIFFUSE))
 		L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput;
 	if(light_flag & PASSMASK_COMPONENT(GLOSSY))
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5745762e183..cb1f410b09f 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -266,7 +266,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
 }
 #endif  /* __VOLUME__ */
 
-#endif /* __SPLIT_KERNEL__ */
+#endif  /* __SPLIT_KERNEL__ */
 
 ccl_device_forceinline bool kernel_path_shader_apply(
 	KernelGlobals *kg,
@@ -434,7 +434,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		else if(result == VOLUME_PATH_MISSED) {
 			break;
 		}
-#endif /* __VOLUME__*/
+#endif  /* __VOLUME__*/
 
 		/* Shade background. */
 		if(!hit) {
@@ -557,7 +557,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 }
 
-#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
+#endif  /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
 ccl_device_forceinline void kernel_path_integrate(
 	KernelGlobals *kg,
@@ -605,7 +605,7 @@ ccl_device_forceinline void kernel_path_integrate(
 		else if(result == VOLUME_PATH_MISSED) {
 			break;
 		}
-#endif /* __VOLUME__*/
+#endif  /* __VOLUME__*/
 
 		/* Shade background. */
 		if(!hit) {
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index feaea15d3c4..d2506fc1e7e 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
 			}
 		}
 	}
-#endif /* __EMISSION__ */
+#endif  /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -277,10 +277,10 @@ ccl_device void kernel_branched_path_volume_connect_light(
 			}
 		}
 	}
-#endif /* __EMISSION__ */
+#endif  /* __EMISSION__ */
 }
-#endif /* __SPLIT_KERNEL__ */
+#endif  /* __SPLIT_KERNEL__ */
 
-#endif /* __VOLUME_SCATTER__ */
+#endif  /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index e32d4bbbc1b..de8cc4a0cef 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -145,4 +145,4 @@ ccl_device int dequeue_ray_index(
 
 CCL_NAMESPACE_END
 
-#endif // __KERNEL_QUEUE_H__
+#endif  // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index b33e4eba8a4..61ddf4a4f81 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -50,7 +50,7 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 	return result;
 }
 
-#endif /* __SOBOL__ */
+#endif  /* __SOBOL__ */
 
 
 ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index e834b701f96..af883aa715b 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -1276,4 +1276,9 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 }
 #endif  /* __TRANSPARENT_SHADOWS__ */
 
+ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+{
+	return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 8a0da6c3b13..fafa3ad4bfa 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -446,7 +446,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
 }
 
 #  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
                                       ShaderData *sd,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index e93100a6442..864aa7c470a 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,6 +17,12 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
+#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
+#  include <embree3/rtcore.h>
+#  include <embree3/rtcore_scene.h>
+#  define __EMBREE__  /* Defined only for non-GPU builds with WITH_EMBREE; tested by the #ifdef __EMBREE__ sections later in this header. */
+#endif  /* !__KERNEL_GPU__ && WITH_EMBREE */
+
 #include "kernel/kernel_math.h"
 #include "kernel/svm/svm_types.h"
 #include "util/util_static_assert.h"
@@ -53,6 +59,7 @@ CCL_NAMESPACE_BEGIN
 #define OBJECT_NONE				(~0)
 #define PRIM_NONE				(~0)
 #define LAMP_NONE				(~0)
+#define ID_NONE					(0.0f)
 
 #define VOLUME_STACK_SIZE		32
 
@@ -415,6 +422,7 @@ typedef enum PassType {
 	PASS_RAY_BOUNCES,
 #endif
 	PASS_RENDER_TIME,
+	PASS_CRYPTOMATTE,
 	PASS_CATEGORY_MAIN_END = 31,
 
 	PASS_MIST = 32,
@@ -443,6 +451,14 @@ typedef enum PassType {
 
 #define PASS_ANY (~0)
 
+typedef enum CryptomatteType {  /* Bit flags: every non-zero value occupies its own bit, so values can be combined with bitwise OR. */
+	CRYPT_NONE = 0,
+	CRYPT_OBJECT = (1 << 0),
+	CRYPT_MATERIAL = (1 << 1),
+	CRYPT_ASSET = (1 << 2),
+	CRYPT_ACCURATE = (1 << 3),  /* NOTE(review): unlike the three flags above this does not name an ID kind; presumably a mode switch, confirm against the code that tests it. */
+} CryptomatteType;
+
 typedef enum DenoisingPassOffsets {
 	DENOISING_PASS_NORMAL             = 0,
 	DENOISING_PASS_NORMAL_VAR         = 3,
@@ -599,7 +615,7 @@ typedef ccl_addr_space struct PathRadiance {
 
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
-#endif /* __KERNEL_DEBUG__ */
+#endif  /* __KERNEL_DEBUG__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -712,6 +728,9 @@ typedef struct Ray {
 /* Intersection */
 
 typedef struct Intersection {
+#ifdef __EMBREE__
+	float3 Ng;
+#endif
 	float t, u, v;
 	int prim;
 	int object;
@@ -1260,6 +1279,9 @@ typedef struct KernelFilm {
 	int pass_shadow;
 	float pass_shadow_scale;
 	int filter_table_offset;
+	int cryptomatte_passes;
+	int cryptomatte_depth;
+	int pass_cryptomatte;
 
 	int pass_mist;
 	float mist_start;
@@ -1270,8 +1292,6 @@ typedef struct KernelFilm {
 	int pass_denoising_clean;
 	int denoising_flags;
 
-	int pad1, pad2, pad3;
-
 	/* XYZ to rendering color space transform. float4 instead of float3 to
 	 * ensure consistent padding/alignment across devices. */
 	float4 xyz_to_r;
@@ -1385,20 +1405,29 @@ typedef enum KernelBVHLayout {
 	BVH_LAYOUT_BVH2 = (1 << 0),
 	BVH_LAYOUT_BVH4 = (1 << 1),
 	BVH_LAYOUT_BVH8 = (1 << 2),
-
+	BVH_LAYOUT_EMBREE = (1 << 3),
 	BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
 	BVH_LAYOUT_ALL = (unsigned int)(-1),
 } KernelBVHLayout;
 
 typedef struct KernelBVH {
-	/* root node */
+	/* Own BVH (root is the root node). */
 	int root;
 	int have_motion;
 	int have_curves;
 	int have_instancing;
 	int bvh_layout;
 	int use_bvh_steps;
+
+	/* Embree: scene handle, compiled in only when __EMBREE__ is defined. NOTE(review): the pads in each branch presumably keep the struct size a multiple of 16 for static_assert_align; confirm. */
+#ifdef __EMBREE__
+	RTCScene scene;
+#  ifndef __KERNEL_64_BIT__
+	int pad1;
+#  endif  /* !__KERNEL_64_BIT__ */
+#else  /* !__EMBREE__ */
 	int pad1, pad2;
+#endif  /* __EMBREE__ */
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1460,7 +1489,11 @@ typedef struct KernelObject {
 	uint patch_map_offset;
 	uint attribute_map_offset;
 	uint motion_offset;
-	uint pad;
+	uint pad1;
+
+	float cryptomatte_object;
+	float cryptomatte_asset;
+	float pad2, pad3;
 } KernelObject;
 static_assert_align(KernelObject, 16);
 
@@ -1540,7 +1573,7 @@ static_assert_align(KernelParticle, 16);
 
 typedef struct KernelShader {
 	float constant_emission[3];
-	float pad1;
+	float cryptomatte_id;
 	int flags;
 	int pass_id;
 	int pad2, pad3;
@@ -1672,4 +1705,4 @@ typedef struct WorkTile {
 
 CCL_NAMESPACE_END
 
-#endif /*  __KERNEL_TYPES_H__ */
+#endif  /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index d71761a97bc..d6d283c42c5 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -87,7 +87,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
 	return true;
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 ccl_device float3 volume_color_transmittance(float3 sigma, float t)
 {
@@ -270,7 +270,7 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
 		kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 /* Equi-angular sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media" */
@@ -1075,7 +1075,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
-#endif /* __SPLIT_KERNEL */
+#endif  /* __SPLIT_KERNEL */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -1377,6 +1377,6 @@ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
 	}
 }
 
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index b62aa9663ec..e036b53b810 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 26777fdabb2..4c758711481 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index b77b7350d86..ae4fd85780d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -26,7 +26,7 @@ template<typename T> struct TextureInterpolator  {
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 	static ccl_always_inline float4 read(float4 r)
 	{
@@ -540,4 +540,4 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 
 CCL_NAMESPACE_END
 
-#endif // __KERNEL_CPU_IMAGE_H__
+#endif  // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 5ec1655ab05..759b7e4c20d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -97,7 +97,7 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
 	{
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Film */
@@ -120,7 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -141,7 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -176,7 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	else {
 		kernel_background_evaluate(kg, input, output, i);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 #else  /* __SPLIT_KERNEL__ */
@@ -208,7 +208,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		ccl_local type locals; \
 		kernel_##name(kg, &locals); \
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 0561c40e6b1..b856cbde45c 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int w,
                                        int h,
                                        int stride,
-                                       int shift_stride,
+                                       int pass_stride,
                                        int r,
                                        int channel_offset,
                                        float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                             int w,
                             int h,
                             int stride,
-                            int shift_stride,
+                            int pass_stride,
                             int r,
                             int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                    int w,
                                    int h,
                                    int stride,
-                                   int shift_stride,
+                                   int pass_stride,
                                    int r,
                                    int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int w,
                                      int h,
                                      int stride,
-                                     int shift_stride,
+                                     int pass_stride,
                                      int r,
                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
-                                         int f,
-                                         int pass_stride)
+                                         int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 8a180a509e8..af311027f78 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -40,14 +40,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;  /* Kept as a flag so out-of-range threads skip the work but still reach the barrier below. */
+	uint x, y, sample;  /* Function scope: set by get_work_pixel() and reused by the cryptomatte step. */
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();  /* Sits outside the thread_is_active check, so active and inactive threads alike execute it. */
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -56,14 +63,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
 kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;  /* Kept as a flag so out-of-range threads skip the work but still reach the barrier below. */
+	uint x, y, sample;  /* Function scope: set by get_work_pixel() and reused by the cryptomatte step. */
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();  /* Sits outside the thread_is_active check, so active and inactive threads alike execute it. */
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 3c75754fb39..a550f97f4eb 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int w,
                                                     int h,
                                                     int stride,
-                                                    int shift_stride,
+                                                    int pass_stride,
                                                     int r,
                                                     int channel_offset,
                                                     float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
                                          int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
                                                 int w,
                                                 int h,
                                                 int stride,
-                                                int shift_stride,
+                                                int pass_stride,
                                                 int r,
                                                 int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int w,
                                                   int h,
                                                   int stride,
-                                                  int shift_stride,
+                                                  int pass_stride,
                                                   int r,
                                                   int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int w,
                                                       int h,
                                                       int stride,
-                                                      int shift_stride,
+                                                      int pass_stride,
                                                       int r,
-                                                      int f,
-                                                      int pass_stride)
+                                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 63128d0aecf..de1f5088629 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -66,9 +66,17 @@ __kernel void kernel_ocl_path_trace(
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
+	bool thread_is_active = x < sx + sw && y < sy + sh;
+	if(thread_is_active) {
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
+	}
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(thread_is_active) {
+			kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride);
+		}
+	}
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index dd9d683e030..79af831c2fb 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -142,7 +142,7 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index d9aeb9ab9fb..2a50704b569 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -146,4 +146,4 @@ CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure)
 
 CCL_NAMESPACE_END
 
-#endif /* __OSL_CLOSURES_H__ */
+#endif  /* __OSL_CLOSURES_H__ */
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 30b29793e2d..88192fbcccb 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -94,4 +94,4 @@ CCL_NAMESPACE_END
 
 #endif
 
-#endif /* __OSL_GLOBALS_H__ */
+#endif  /* __OSL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 7902381440b..97f97a4887e 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -884,6 +884,23 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo
 	return false; /* never called by OSL */
 }
 
+TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring filename)
+{
+	/* Builtin textures (names starting with '@') do not use texture handles;
+	 * NULL is a dummy so the OSL runtime optimizer treats them as valid. */
+	const bool is_builtin = filename.length() && filename[0] == '@';
+	if(is_builtin) {
+		return NULL;
+	}
+
+	return texturesys()->get_texture_handle(filename);
+}
+
+bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
+{  /* Validity check for a texture handle; the answer comes straight from the texture system. */
+	return texturesys()->good(texture_handle);  /* NOTE(review): get_texture_handle() returns NULL for '@' builtin names; confirm the texture system gives the intended answer for a NULL handle. */
+}
+
 bool OSLRenderServices::texture(ustring filename,
                                 TextureHandle *texture_handle,
                                 TexturePerthread *texture_thread_info,
@@ -894,7 +911,8 @@ bool OSLRenderServices::texture(ustring filename,
                                 int nchannels,
                                 float *result,
                                 float *dresultds,
-                                float *dresultdt)
+                                float *dresultdt,
+                                ustring *errormessage)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -1035,7 +1053,7 @@ bool OSLRenderServices::texture(ustring filename,
 		 * other nasty stuff happening.
 		 */
 		string err = ts->geterror();
-		(void)err;
+		(void) err;
 	}
 
 	return status;
@@ -1114,7 +1132,7 @@ bool OSLRenderServices::texture3d(ustring filename,
 		 * other nasty stuff happening.
 		 */
 		string err = ts->geterror();
-		(void)err;
+		(void) err;
 	}
 
 	return status;
@@ -1156,7 +1174,13 @@ bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filenam
                                          TypeDesc datatype, void *data)
 {
 	OSL::TextureSystem *ts = osl_ts;
-	return ts->get_texture_info(filename, subimage, dataname, datatype, data);
+	if(filename.length() && filename[0] == '@') {
+		/* Special builtin textures. */
+		return false;
+	}
+	else {
+		return ts->get_texture_info(filename, subimage, dataname, datatype, data);
+	}
 }
 
 int OSLRenderServices::pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center,
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 50044746fd1..712b06b41b8 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -93,6 +93,10 @@ public:
 	bool getmessage(OSL::ShaderGlobals *sg, ustring source, ustring name,
 	                TypeDesc type, void *val, bool derivatives);
 
+	TextureSystem::TextureHandle *get_texture_handle(ustring filename);
+
+	bool good(TextureSystem::TextureHandle *texture_handle);
+
 	bool texture(ustring filename,
 	             TextureSystem::TextureHandle *texture_handle,
 	             TexturePerthread *texture_thread_info,
@@ -103,7 +107,8 @@ public:
 	             int nchannels,
 	             float *result,
 	             float *dresultds,
-	             float *dresultdt);
+	             float *dresultdt,
+	             ustring *errormessage);
 
 	bool texture3d(ustring filename,
 	               TextureHandle *texture_handle,
@@ -194,4 +199,4 @@ private:
 
 CCL_NAMESPACE_END
 
-#endif /* __OSL_SERVICES_H__  */
+#endif  /* __OSL_SERVICES_H__  */
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 6a690e880ad..a89bb3fd1a3 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -193,7 +193,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 			float data[9];
 			bool found = kg->osl->services->get_attribute(sd, true, OSLRenderServices::u_empty, TypeDesc::TypeVector,
 			                                              OSLRenderServices::u_geom_undisplaced, data);
-			(void)found;
+			(void) found;
 			assert(found);
 
 			memcpy(&sd->P, data, sizeof(float)*3);
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 571a3f502be..9824f966a44 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -66,4 +66,4 @@ CCL_NAMESPACE_END
 
 #endif
 
-#endif /* __OSL_SHADER_H__ */
+#endif  /* __OSL_SHADER_H__ */
diff --git a/intern/cycles/kernel/shaders/oslutil.h b/intern/cycles/kernel/shaders/oslutil.h
index 141e5d27e3a..592a8ad12d9 100644
--- a/intern/cycles/kernel/shaders/oslutil.h
+++ b/intern/cycles/kernel/shaders/oslutil.h
@@ -92,4 +92,4 @@ float wireframe(string edge_type, float line_width) { return wireframe(edge_type
 float wireframe(string edge_type) { return wireframe(edge_type, 1.0, 1); }
 float wireframe() { return wireframe("polygons", 1.0, 1); }
 
-#endif /* CCL_OSLUTIL_H */
+#endif  /* CCL_OSLUTIL_H */
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 4a8378796ba..7136c746321 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -284,33 +284,63 @@ point rotate (point p, float angle, point a, point b)
 
 normal ensure_valid_reflection(normal Ng, vector I, normal N)
 {
+    /* The implementation here mirrors the one in kernel_montecarlo.h,
+     * check there for an explanation of the algorithm. */
+
     float sqr(float x) { return x*x; }
 
     vector R = 2*dot(N, I)*N - I;
-    if (dot(Ng, R) >= 0.05) {
+
+    float threshold = min(0.9*dot(Ng, I), 0.01);  /* Smallest dot(Ng, R) for which N is returned unchanged. */
+    if(dot(Ng, R) >= threshold) {
         return N;
     }
 
-    /* Form coordinate system with Ng as the Z axis and N inside the X-Z-plane.
-     * The X axis is found by normalizing the component of N that's orthogonal to Ng.
-     * The Y axis isn't actually needed.
-     */
-    vector X = normalize(N - dot(N, Ng)*Ng);
+    float NdotNg = dot(N, Ng);
+    vector X = normalize(N - NdotNg*Ng);  /* Unit vector along the part of N orthogonal to Ng; the result is assembled from X and Ng. */
 
-    /* Calculate N.z and N.x in the local coordinate system. */
     float Ix = dot(I, X), Iz = dot(I, Ng);
-    float Ix2 = sqr(dot(I, X)), Iz2 = sqr(dot(I, Ng));
-    float Ix2Iz2 = Ix2 + Iz2;
-
-    float a = sqrt(Ix2*(Ix2Iz2 - sqr(0.05)));
-    float b = Iz*0.05 + Ix2Iz2;
-    float c = (a + b > 0.0)? (a + b) : (-a + b);
+    float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+    float a = Ix2 + Iz2;
+
+    float b = sqrt(Ix2*(a - sqr(threshold)));
+    float c = Iz*threshold + a;
+
+    float fac = 0.5/a;
+    float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c);  /* Two candidates for the squared Ng-component of the new normal. */
+    int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));  /* A candidate is usable only inside (1e-5, 1 + 1e-5]. */
+    int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
+
+    float N_new_x, N_new_z;
+    if(valid1 && valid2) {  /* Both usable: compare the reflections they produce. */
+        float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
+        float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
+
+        float R1 = 2*(N1_x*Ix + N1_z*Iz)*N1_z - Iz;  /* Ng-component of the reflection for candidate 1 (same form as R above, written in X/Ng components). */
+        float R2 = 2*(N2_x*Ix + N2_z*Iz)*N2_z - Iz;  /* Likewise for candidate 2. */
+
+        valid1 = (R1 >= 1e-5);  /* Re-test on the reflection itself. */
+        valid2 = (R2 >= 1e-5);
+        if(valid1 && valid2) {  /* Both pass: take the candidate with the smaller R. */
+            N_new_x = (R1 < R2)? N1_x : N2_x;
+            N_new_z = (R1 < R2)? N1_z : N2_z;
+        }
+        else {  /* Otherwise take the candidate with the larger R. */
+            N_new_x = (R1 > R2)? N1_x : N2_x;
+            N_new_z = (R1 > R2)? N1_z : N2_z;
+        }
 
-    float Nz = sqrt(0.5 * c * (1.0 / Ix2Iz2));
-    float Nx = sqrt(1.0 - sqr(Nz));
+    }
+    else if(valid1 || valid2) {  /* Exactly one usable candidate: use it. */
+        float Nz2 = valid1? N1_z2 : N2_z2;
+        N_new_x = sqrt(1.0 - Nz2);
+        N_new_z = sqrt(Nz2);
+    }
+    else {  /* No usable candidate: fall back to Ng. */
+        return Ng;
+    }
 
-    /* Transform back into global coordinates. */
-    return Nx*X + Nz*Ng;
+    return N_new_x*X + N_new_z*Ng;  /* Combine the chosen components along X and Ng. */
 }
 
 
@@ -485,7 +515,7 @@ float smooth_linearstep (float edge0, float edge1, float x_, float eps_) {
         else if (x >= eps && x <= 1.0-eps) result = x;
         else if (x >= 1.0+eps)             result = 1;
         else if (x < eps)                  result = rampup (x+eps, 2.0*eps);
-        else /* if (x < 1.0+eps) */        result = 1.0 - rampup (1.0+eps - x, 2.0*eps);
+        else  /* if (x < 1.0+eps) */        result = 1.0 - rampup (1.0+eps - x, 2.0*eps);
     } else {
         result = step (edge0, x_);
     }
@@ -656,4 +686,4 @@ int getmatrix (string fromspace, output matrix M) {
 #undef PERCOMP2
 #undef PERCOMP2F
 
-#endif /* CCL_STDOSL_H */
+#endif  /* CCL_STDOSL_H */
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 180c0b57077..18eec6372f1 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -80,8 +80,10 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	bool ray_was_updated = false;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		ray_was_updated = true;
 		uint sample = state->sample;
 		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
 		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
@@ -92,6 +94,17 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
+			uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+			ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+		}
+	}
+
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
 		ccl_global uint *work_pools = kernel_split_params.work_pools;
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
index 2132c42220f..666355de334 100644
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -78,7 +78,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 			}
 		}
 	}
-#  endif /* __KERNEL_OPENCL__ */
+#  endif  /* __KERNEL_OPENCL__ */
 
 	/* copy to destination */
 	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
@@ -91,7 +91,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 			kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
 		}
 	}
-#endif /* __KERNEL_CUDA__ */
+#endif  /* __KERNEL_CUDA__ */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 9297e1e0ad5..3f6b3977d79 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
 {
-	(void)kg;  /* Unused on CPU. */
+	(void) kg;  /* Unused on CPU. */
 
 	uint64_t size = 0;
 #define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
@@ -48,7 +48,7 @@ ccl_device_inline void split_data_init(KernelGlobals *kg,
                                        ccl_global void *data,
                                        ccl_global char *ray_state)
 {
-	(void)kg;  /* Unused on CPU. */
+	(void) kg;  /* Unused on CPU. */
 
 	ccl_global char *p = (ccl_global char*)data;
 
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 56194d9f857..83df1e2a0a6 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -86,14 +86,14 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
 #else
 #  define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
+#endif  /* __SUBSURFACE__ */
 
 #ifdef __VOLUME__
 #  define SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
 #else
 #  define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
 #define SPLIT_DATA_ENTRIES \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index ab69afa051e..ccb9aef7a5b 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -313,7 +313,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 			case NODE_LEAVE_BUMP_EVAL:
 				svm_node_leave_bump_eval(kg, sd, stack, node.y);
 				break;
-#      endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
+#      endif  /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
 #    endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
 			case NODE_HSV:
 				svm_node_hsv(kg, sd, stack, node, &offset);
@@ -497,4 +497,4 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_H__ */
+#endif  /* __SVM_H__ */
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 64bf8244999..3cf33f4d431 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -262,7 +262,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					        ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra))
 					        : NULL;
 
-					if (bsdf && extra) {
+					if(bsdf && extra) {
 						bsdf->N = N;
 						bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f;
 						bsdf->T = T;
@@ -285,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						/* setup bsdf */
 						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
 							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
-						else /* use multi-scatter GGX */
+						else  /* use multi-scatter GGX */
 							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
 					}
 				}
@@ -314,7 +314,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							        ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra))
 							        : NULL;
 
-							if (bsdf && extra) {
+							if(bsdf && extra) {
 								bsdf->N = N;
 								bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
 								bsdf->extra = extra;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 27127b85323..41538d1138d 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -59,4 +59,4 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_HSV_H__ */
+#endif  /* __SVM_HSV_H__ */
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index a3e4b6e87cd..6f39391057e 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -108,4 +108,4 @@ ccl_device void svm_node_curves(KernelGlobals *kg, ShaderData *sd, float *stack,
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_RAMP_H__ */
+#endif  /* __SVM_RAMP_H__ */
diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h b/intern/cycles/kernel/svm/svm_ramp_util.h
index a67689ff9d1..847108ff1c2 100644
--- a/intern/cycles/kernel/svm/svm_ramp_util.h
+++ b/intern/cycles/kernel/svm/svm_ramp_util.h
@@ -95,4 +95,4 @@ ccl_device float float_ramp_lookup(const float *ramp,
 
 CCL_NAMESPACE_END
 
-#endif /* __SVM_RAMP_UTIL_H__ */
+#endif  /* __SVM_RAMP_UTIL_H__ */
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 910537a2539..0f1dfa4936b 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -531,4 +531,4 @@ typedef enum ClosureType {
 
 CCL_NAMESPACE_END
 
-#endif /*  __SVM_TYPES_H__ */
+#endif  /*  __SVM_TYPES_H__ */
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 7b60ab6e6ae..80b63dc80cd 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -24,7 +24,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, NodeWaveProfile profile, f
 
 	if(type == NODE_WAVE_BANDS)
 		n = (p.x + p.y + p.z) * 10.0f;
-	else /* NODE_WAVE_RINGS */
+	else  /* NODE_WAVE_RINGS */
 		n = len(p) * 20.0f;
 
 	if(distortion != 0.0f)
author	Stefan Werner <stefan.werner@tangent-animation.com>	2018-11-23 15:08:15 +0300
committer	Stefan Werner <stefan.werner@tangent-animation.com>	2018-11-23 15:19:53 +0300
commit	071f4f4ce0b9520ab0c73d6d68365ad449ca8b80 (patch)
tree	9f37bfcac669366b9ad5fb7605f2fbbed9b71b0a /intern/cycles/kernel
parent	0a2b2d59a5897212ba3771503feb6770fb636bc8 (diff)