From 4355603790712032e89fa4da6d8ce7f3ede62b4f Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon, 11 Jul 2016 12:28:45 +0200
Subject: Cycles: Move BVK kernel files to own directory

BVH traversal is not really that much a geometry and we've got
quite some traversals now. Makes sense to keep them separate in
the name of source structure clarity.
---
 intern/cycles/kernel/bvh/bvh_nodes.h | 656 +++++++++++++++++++++++++++++++++++
 1 file changed, 656 insertions(+)
 create mode 100644 intern/cycles/kernel/bvh/bvh_nodes.h

(limited to 'intern/cycles/kernel/bvh/bvh_nodes.h')

diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
new file mode 100644
index 00000000000..5b0d8785d0e
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright 2011-2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
+// 3-vector which might be faster.
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+                                                           int nodeAddr,
+                                                           int child)
+{
+	Transform space;
+	const int child_addr = nodeAddr + child * 3;
+	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
+	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
+	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
+	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+	return space;
+}
+
+#if !defined(__KERNEL_SSE2__)
+ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+                                                 const float3 P,
+                                                 const float3 idir,
+                                                 const float t,
+                                                 const int nodeAddr,
+                                                 const uint visibility,
+                                                 float dist[2])
+{
+
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	dist[0] = c0min;
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+                                                        const float3 P,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const float difl,
+                                                        const float extmax,
+                                                        const int nodeAddr,
+                                                        const uint visibility,
+                                                        float dist[2])
+{
+
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	if(difl != 0.0f) {
+		float hdiff = 1.0f + difl;
+		float ldiff = 1.0f - difl;
+		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);
+			c0max = min(hdiff * c0max, c0max + extmax);
+		}
+		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	dist[0] = c0min;
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        int nodeAddr,
+        int child,
+        float dist[2])
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);
+	float3 aligned_P = transform_point(&space, P);
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);
+	float3 tLowerXYZ = aligned_P * nrdir;
+	float3 tUpperXYZ = tLowerXYZ - nrdir;
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;
+	return tNear <= tFar;
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        const float difl,
+        int nodeAddr,
+        int child,
+        float dist[2])
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);
+	float3 aligned_P = transform_point(&space, P);
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);
+	float3 tLowerXYZ = aligned_P * nrdir;
+	float3 tUpperXYZ = tLowerXYZ - nrdir;
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;
+	if(difl != 0.0f) {
+		/* TODO(sergey): Same as for QBVH, needs a proper use. */
+		const float round_down = 1.0f - difl;
+		const float round_up = 1.0f + difl;
+		return round_down*tNear <= round_up*tFar;
+	}
+	else {
+		return tNear <= tFar;
+	}
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const float3 idir,
+                                                   const float t,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float dist[2])
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const float3 idir,
+                                                          const float t,
+                                                          const float difl,
+                                                          const float extmax,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float dist[2])
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3 P,
+                                         const float3 dir,
+                                         const float3 idir,
+                                         const float t,
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect(kg,
+		                                    P,
+		                                    dir,
+		                                    idir,
+		                                    t,
+		                                    nodeAddr,
+		                                    visibility,
+		                                    dist);
+	}
+	else {
+		return bvh_aligned_node_intersect(kg,
+		                                  P,
+		                                  idir,
+		                                  t,
+		                                  nodeAddr,
+		                                  visibility,
+		                                  dist);
+	}
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3 P,
+                                                const float3 dir,
+                                                const float3 idir,
+                                                const float t,
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect_robust(kg,
+		                                           P,
+		                                           dir,
+		                                           idir,
+		                                           t,
+		                                           difl,
+		                                           extmax,
+		                                           nodeAddr,
+		                                           visibility,
+		                                           dist);
+	}
+	else {
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P,
+		                                         idir,
+		                                         t,
+		                                         difl,
+		                                         extmax,
+		                                         nodeAddr,
+		                                         visibility,
+		                                         dist);
+	}
+}
+#else  /* !defined(__KERNEL_SSE2__) */
+
+int ccl_device_inline bvh_aligned_node_intersect(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* calculate { c0min, c1min, -c0max, -c1max} */
+	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	const ssef tminmax = minmax ^ pn;
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+
+	int mask = movemask(lrhit);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_aligned_node_intersect_robust(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const float difl,
+        const float extmax,
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* calculate { c0min, c1min, -c0max, -c1max} */
+	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	const ssef tminmax = minmax ^ pn;
+
+	if(difl != 0.0f) {
+		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+		float4 *tminmaxview = (float4*)&tminmax;
+		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
+		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
+		float hdiff = 1.0f + difl;
+		float ldiff = 1.0f - difl;
+		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);
+			c0max = min(hdiff * c0max, c0max + extmax);
+		}
+		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+
+	int mask = movemask(lrhit);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const ssef& tnear,
+                                                   const ssef& tfar,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);;
+	float3 aligned_P0 = transform_point(&space0, P),
+	       aligned_P1 = transform_point(&space1, P);
+	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+	                    aligned_P1.x * nrdir1.x,
+	                    0.0f, 0.0f),
+	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
+	                    aligned_P1.y * nrdir1.y,
+	                    0.0f,
+	                    0.0f),
+	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+	                    aligned_P1.z * nrdir1.z,
+	                    0.0f,
+	                    0.0f);
+
+	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+	ssef tnear_x = min(tLowerX, tUpperX);
+	ssef tnear_y = min(tLowerY, tUpperY);
+	ssef tnear_z = min(tLowerZ, tUpperZ);
+	ssef tfar_x = max(tLowerX, tUpperX);
+	ssef tfar_y = max(tLowerY, tUpperY);
+	ssef tfar_z = max(tLowerZ, tUpperZ);
+
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	sseb vmask = tNear <= tFar;
+	dist[0] = tNear.f[0];
+	dist[1] = tNear.f[1];
+
+	int mask = (int)movemask(vmask);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const ssef& tnear,
+                                                          const ssef& tfar,
+                                                          const float difl,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	float3 aligned_dir0 = transform_direction(&space0, dir),
+	       aligned_dir1 = transform_direction(&space1, dir);;
+	float3 aligned_P0 = transform_point(&space0, P),
+	       aligned_P1 = transform_point(&space1, P);
+	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+	                    aligned_P1.x * nrdir1.x,
+	                    0.0f, 0.0f),
+	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
+	                    aligned_P1.y * nrdir1.y,
+	                    0.0f,
+	                    0.0f),
+	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+	                    aligned_P1.z * nrdir1.z,
+	                    0.0f,
+	                    0.0f);
+
+	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+	ssef tnear_x = min(tLowerX, tUpperX);
+	ssef tnear_y = min(tLowerY, tUpperY);
+	ssef tnear_z = min(tLowerZ, tUpperZ);
+	ssef tfar_x = max(tLowerX, tUpperX);
+	ssef tfar_y = max(tLowerY, tUpperY);
+	ssef tfar_z = max(tLowerZ, tUpperZ);
+
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	sseb vmask;
+	if(difl != 0.0f) {
+		const float round_down = 1.0f - difl;
+		const float round_up = 1.0f + difl;
+		vmask = round_down*tNear <= round_up*tFar;
+	}
+	else {
+		vmask = tNear <= tFar;
+	}
+
+	dist[0] = tNear.f[0];
+	dist[1] = tNear.f[1];
+
+	int mask = (int)movemask(vmask);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3& P,
+                                         const float3& dir,
+                                         const ssef& tnear,
+                                         const ssef& tfar,
+                                         const ssef& tsplat,
+                                         const ssef Psplat[3],
+                                         const ssef idirsplat[3],
+                                         const shuffle_swap_t shufflexyz[3],
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect(kg,
+		                                    P,
+		                                    dir,
+		                                    tnear,
+		                                    tfar,
+		                                    nodeAddr,
+		                                    visibility,
+		                                    dist);
+	}
+	else {
+		return bvh_aligned_node_intersect(kg,
+		                                  P,
+		                                  dir,
+		                                  tsplat,
+		                                  Psplat,
+		                                  idirsplat,
+		                                  shufflexyz,
+		                                  nodeAddr,
+		                                  visibility,
+		                                  dist);
+	}
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3& P,
+                                                const float3& dir,
+                                                const ssef& tnear,
+                                                const ssef& tfar,
+                                                const ssef& tsplat,
+                                                const ssef Psplat[3],
+                                                const ssef idirsplat[3],
+                                                const shuffle_swap_t shufflexyz[3],
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect_robust(kg,
+		                                           P,
+		                                           dir,
+		                                           tnear,
+		                                           tfar,
+		                                           difl,
+		                                           nodeAddr,
+		                                           visibility,
+		                                           dist);
+	}
+	else {
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P,
+		                                         dir,
+		                                         tsplat,
+		                                         Psplat,
+		                                         idirsplat,
+		                                         shufflexyz,
+		                                         difl,
+		                                         extmax,
+		                                         nodeAddr,
+		                                         visibility,
+		                                         dist);
+	}
+}
+#endif  /* !defined(__KERNEL_SSE2__) */
-- 
cgit v1.2.3