1 files changed, 413 insertions, 486 deletions
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 060b3934a41..042630121c8 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -20,12 +20,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
                                                                 int node_addr,
                                                                 int child)
 {
-	Transform space;
-	const int child_addr = node_addr + child * 3;
-	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
-	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
-	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
-	return space;
+  Transform space;
+  const int child_addr = node_addr + child * 3;
+  space.x = kernel_tex_fetch(__bvh_nodes, child_addr + 1);
+  space.y = kernel_tex_fetch(__bvh_nodes, child_addr + 2);
+  space.z = kernel_tex_fetch(__bvh_nodes, child_addr + 3);
+  return space;
 }
 
 #if !defined(__KERNEL_SSE2__)
@@ -38,42 +38,41 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                       float dist[2])
 {
 
-	/* fetch node data */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
-	float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
-	float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
-
-	/* intersect ray against child nodes */
-	float c0lox = (node0.x - P.x) * idir.x;
-	float c0hix = (node0.z - P.x) * idir.x;
-	float c0loy = (node1.x - P.y) * idir.y;
-	float c0hiy = (node1.z - P.y) * idir.y;
-	float c0loz = (node2.x - P.z) * idir.z;
-	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
-	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
-
-	float c1lox = (node0.y - P.x) * idir.x;
-	float c1hix = (node0.w - P.x) * idir.x;
-	float c1loy = (node1.y - P.y) * idir.y;
-	float c1hiy = (node1.w - P.y) * idir.y;
-	float c1loz = (node2.y - P.z) * idir.z;
-	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
-	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
-
-	dist[0] = c0min;
-	dist[1] = c1min;
-
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-#else
-	return ((c0max >= c0min)? 1: 0) |
-	       ((c1max >= c1min)? 2: 0);
-#endif
+  /* fetch node data */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
+  float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
+  float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
+
+  /* intersect ray against child nodes */
+  float c0lox = (node0.x - P.x) * idir.x;
+  float c0hix = (node0.z - P.x) * idir.x;
+  float c0loy = (node1.x - P.y) * idir.y;
+  float c0hiy = (node1.z - P.y) * idir.y;
+  float c0loz = (node2.x - P.z) * idir.z;
+  float c0hiz = (node2.z - P.z) * idir.z;
+  float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+  float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
+
+  float c1lox = (node0.y - P.x) * idir.x;
+  float c1hix = (node0.w - P.x) * idir.x;
+  float c1loy = (node1.y - P.y) * idir.y;
+  float c1hiy = (node1.w - P.y) * idir.y;
+  float c1loz = (node2.y - P.z) * idir.z;
+  float c1hiz = (node2.w - P.z) * idir.z;
+  float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+  float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
+
+  dist[0] = c0min;
+  dist[1] = c1min;
+
+#  ifdef __VISIBILITY_FLAG__
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+         (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+#  else
+  return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
+#  endif
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
@@ -87,118 +86,115 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                              float dist[2])
 {
 
-	/* fetch node data */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
-	float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
-	float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
-
-	/* intersect ray against child nodes */
-	float c0lox = (node0.x - P.x) * idir.x;
-	float c0hix = (node0.z - P.x) * idir.x;
-	float c0loy = (node1.x - P.y) * idir.y;
-	float c0hiy = (node1.z - P.y) * idir.y;
-	float c0loz = (node2.x - P.z) * idir.z;
-	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
-	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
-
-	float c1lox = (node0.y - P.x) * idir.x;
-	float c1hix = (node0.w - P.x) * idir.x;
-	float c1loy = (node1.y - P.y) * idir.y;
-	float c1hiy = (node1.w - P.y) * idir.y;
-	float c1loz = (node2.y - P.z) * idir.z;
-	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
-	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
-
-	if(difl != 0.0f) {
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-
-	dist[0] = c0min;
-	dist[1] = c1min;
-
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-#else
-	return ((c0max >= c0min)? 1: 0) |
-	       ((c1max >= c1min)? 2: 0);
-#endif
+  /* fetch node data */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
+  float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
+  float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
+
+  /* intersect ray against child nodes */
+  float c0lox = (node0.x - P.x) * idir.x;
+  float c0hix = (node0.z - P.x) * idir.x;
+  float c0loy = (node1.x - P.y) * idir.y;
+  float c0hiy = (node1.z - P.y) * idir.y;
+  float c0loz = (node2.x - P.z) * idir.z;
+  float c0hiz = (node2.z - P.z) * idir.z;
+  float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+  float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
+
+  float c1lox = (node0.y - P.x) * idir.x;
+  float c1hix = (node0.w - P.x) * idir.x;
+  float c1loy = (node1.y - P.y) * idir.y;
+  float c1hiy = (node1.w - P.y) * idir.y;
+  float c1loz = (node2.y - P.z) * idir.z;
+  float c1hiz = (node2.w - P.z) * idir.z;
+  float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+  float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
+
+  if (difl != 0.0f) {
+    float hdiff = 1.0f + difl;
+    float ldiff = 1.0f - difl;
+    if (__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+      c0min = max(ldiff * c0min, c0min - extmax);
+      c0max = min(hdiff * c0max, c0max + extmax);
+    }
+    if (__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+      c1min = max(ldiff * c1min, c1min - extmax);
+      c1max = min(hdiff * c1max, c1max + extmax);
+    }
+  }
+
+  dist[0] = c0min;
+  dist[1] = c1min;
+
+#  ifdef __VISIBILITY_FLAG__
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+         (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+#  else
+  return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
+#  endif
 }
 
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
-        KernelGlobals *kg,
-        const float3 P,
-        const float3 dir,
-        const float t,
-        int node_addr,
-        int child,
-        float dist[2])
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float t,
+                                                               int node_addr,
+                                                               int child,
+                                                               float dist[2])
 {
-	Transform space  = bvh_unaligned_node_fetch_space(kg, node_addr, child);
-	float3 aligned_dir = transform_direction(&space, dir);
-	float3 aligned_P = transform_point(&space, P);
-	float3 nrdir = -bvh_inverse_direction(aligned_dir);
-	float3 lower_xyz = aligned_P * nrdir;
-	float3 upper_xyz = lower_xyz - nrdir;
-	const float near_x = min(lower_xyz.x, upper_xyz.x);
-	const float near_y = min(lower_xyz.y, upper_xyz.y);
-	const float near_z = min(lower_xyz.z, upper_xyz.z);
-	const float far_x  = max(lower_xyz.x, upper_xyz.x);
-	const float far_y  = max(lower_xyz.y, upper_xyz.y);
-	const float far_z  = max(lower_xyz.z, upper_xyz.z);
-	const float tnear   = max4(0.0f, near_x, near_y, near_z);
-	const float tfar    = min4(t, far_x, far_y, far_z);
-	*dist = tnear;
-	return tnear <= tfar;
+  Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
+  float3 aligned_dir = transform_direction(&space, dir); /* ray direction mapped by `space` */
+  float3 aligned_P = transform_point(&space, P);         /* ray origin mapped by `space` */
+  float3 nrdir = -bvh_inverse_direction(aligned_dir);    /* note the negation */
+  float3 lower_xyz = aligned_P * nrdir; /* equals (0 - P) / d if nrdir is -1 / d (confirm) */
+  float3 upper_xyz = lower_xyz - nrdir; /* equals (1 - P) / d under the same assumption */
+  const float near_x = min(lower_xyz.x, upper_xyz.x); /* per axis: the smaller of the two */
+  const float near_y = min(lower_xyz.y, upper_xyz.y);
+  const float near_z = min(lower_xyz.z, upper_xyz.z);
+  const float far_x = max(lower_xyz.x, upper_xyz.x); /* per axis: the larger of the two */
+  const float far_y = max(lower_xyz.y, upper_xyz.y);
+  const float far_z = max(lower_xyz.z, upper_xyz.z);
+  const float tnear = max4(0.0f, near_x, near_y, near_z); /* never below 0 */
+  const float tfar = min4(t, far_x, far_y, far_z);        /* never above t */
+  *dist = tnear;        /* writes a single float through dist */
+  return tnear <= tfar; /* true when tnear does not exceed tfar */
 }
 
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
-        KernelGlobals *kg,
-        const float3 P,
-        const float3 dir,
-        const float t,
-        const float difl,
-        int node_addr,
-        int child,
-        float dist[2])
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(KernelGlobals *kg,
+                                                                      const float3 P,
+                                                                      const float3 dir,
+                                                                      const float t,
+                                                                      const float difl,
+                                                                      int node_addr,
+                                                                      int child,
+                                                                      float dist[2])
 {
-	Transform space  = bvh_unaligned_node_fetch_space(kg, node_addr, child);
-	float3 aligned_dir = transform_direction(&space, dir);
-	float3 aligned_P = transform_point(&space, P);
-	float3 nrdir = -bvh_inverse_direction(aligned_dir);
-	float3 tLowerXYZ = aligned_P * nrdir;
-	float3 tUpperXYZ = tLowerXYZ - nrdir;
-	const float near_x = min(tLowerXYZ.x, tUpperXYZ.x);
-	const float near_y = min(tLowerXYZ.y, tUpperXYZ.y);
-	const float near_z = min(tLowerXYZ.z, tUpperXYZ.z);
-	const float far_x  = max(tLowerXYZ.x, tUpperXYZ.x);
-	const float far_y  = max(tLowerXYZ.y, tUpperXYZ.y);
-	const float far_z  = max(tLowerXYZ.z, tUpperXYZ.z);
-	const float tnear   = max4(0.0f, near_x, near_y, near_z);
-	const float tfar    = min4(t, far_x, far_y, far_z);
-	*dist = tnear;
-	if(difl != 0.0f) {
-		/* TODO(sergey): Same as for QBVH, needs a proper use. */
-		const float round_down = 1.0f - difl;
-		const float round_up = 1.0f + difl;
-		return round_down*tnear <= round_up*tfar;
-	}
-	else {
-		return tnear <= tfar;
-	}
+  Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
+  float3 aligned_dir = transform_direction(&space, dir); /* ray direction mapped by `space` */
+  float3 aligned_P = transform_point(&space, P);         /* ray origin mapped by `space` */
+  float3 nrdir = -bvh_inverse_direction(aligned_dir);    /* note the negation */
+  float3 tLowerXYZ = aligned_P * nrdir; /* equals (0 - P) / d if nrdir is -1 / d (confirm) */
+  float3 tUpperXYZ = tLowerXYZ - nrdir; /* equals (1 - P) / d under the same assumption */
+  const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); /* per axis: the smaller of the two */
+  const float near_y = min(tLowerXYZ.y, tUpperXYZ.y);
+  const float near_z = min(tLowerXYZ.z, tUpperXYZ.z);
+  const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); /* per axis: the larger of the two */
+  const float far_y = max(tLowerXYZ.y, tUpperXYZ.y);
+  const float far_z = max(tLowerXYZ.z, tUpperXYZ.z);
+  const float tnear = max4(0.0f, near_x, near_y, near_z); /* never below 0 */
+  const float tfar = min4(t, far_x, far_y, far_z);        /* never above t */
+  *dist = tnear; /* stored before, and not changed by, the difl scaling below */
+  if (difl != 0.0f) {
+    /* TODO(sergey): Same as for QBVH, needs a proper use. */
+    const float round_down = 1.0f - difl;
+    const float round_up = 1.0f + difl;
+    return round_down * tnear <= round_up * tfar; /* near * (1 - difl) vs. far * (1 + difl) */
+  }
+  else {
+    return tnear <= tfar; /* difl == 0: unscaled comparison */
+  }
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
@@ -210,25 +206,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                         const uint visibility,
                                                         float dist[2])
 {
-	int mask = 0;
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.x) & visibility))
-#endif
-		{
-			mask |= 1;
-		}
-	}
-	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.y) & visibility))
-#endif
-		{
-			mask |= 2;
-		}
-	}
-	return mask;
+  int mask = 0;
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
+#  ifdef __VISIBILITY_FLAG__
+    if ((__float_as_uint(cnodes.x) & visibility))
+#  endif
+    {
+      mask |= 1;
+    }
+  }
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
+#  ifdef __VISIBILITY_FLAG__
+    if ((__float_as_uint(cnodes.y) & visibility))
+#  endif
+    {
+      mask |= 2;
+    }
+  }
+  return mask;
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
@@ -242,25 +238,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
                                                                const uint visibility,
                                                                float dist[2])
 {
-	int mask = 0;
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.x) & visibility))
-#endif
-		{
-			mask |= 1;
-		}
-	}
-	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.y) & visibility))
-#endif
-		{
-			mask |= 2;
-		}
-	}
-	return mask;
+  int mask = 0;
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) {
+#  ifdef __VISIBILITY_FLAG__
+    if ((__float_as_uint(cnodes.x) & visibility))
+#  endif
+    {
+      mask |= 1;
+    }
+  }
+  if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) {
+#  ifdef __VISIBILITY_FLAG__
+    if ((__float_as_uint(cnodes.y) & visibility))
+#  endif
+    {
+      mask |= 2;
+    }
+  }
+  return mask;
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
@@ -272,26 +268,13 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                               const uint visibility,
                                               float dist[2])
 {
-	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect(kg,
-		                                    P,
-		                                    dir,
-		                                    idir,
-		                                    t,
-		                                    node_addr,
-		                                    visibility,
-		                                    dist);
-	}
-	else {
-		return bvh_aligned_node_intersect(kg,
-		                                  P,
-		                                  idir,
-		                                  t,
-		                                  node_addr,
-		                                  visibility,
-		                                  dist);
-	}
+  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+    return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist);
+  }
+  else {
+    return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
+  }
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
@@ -305,279 +288,244 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                      const uint visibility,
                                                      float dist[2])
 {
-	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect_robust(kg,
-		                                           P,
-		                                           dir,
-		                                           idir,
-		                                           t,
-		                                           difl,
-		                                           extmax,
-		                                           node_addr,
-		                                           visibility,
-		                                           dist);
-	}
-	else {
-		return bvh_aligned_node_intersect_robust(kg,
-		                                         P,
-		                                         idir,
-		                                         t,
-		                                         difl,
-		                                         extmax,
-		                                         node_addr,
-		                                         visibility,
-		                                         dist);
-	}
+  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+    return bvh_unaligned_node_intersect_robust(
+        kg, P, dir, idir, t, difl, extmax, node_addr, visibility, dist);
+  }
+  else {
+    return bvh_aligned_node_intersect_robust(
+        kg, P, idir, t, difl, extmax, node_addr, visibility, dist);
+  }
 }
-#else  /* !defined(__KERNEL_SSE2__) */
-
-int ccl_device_forceinline bvh_aligned_node_intersect(
-        KernelGlobals *kg,
-        const float3& P,
-        const float3& dir,
-        const ssef& tsplat,
-        const ssef Psplat[3],
-        const ssef idirsplat[3],
-        const shuffle_swap_t shufflexyz[3],
-        const int node_addr,
-        const uint visibility,
-        float dist[2])
+#else /* !defined(__KERNEL_SSE2__) */
+
+int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg,
+                                                      const float3 &P, /* unused here */
+                                                      const float3 &dir, /* unused here */
+                                                      const ssef &tsplat,
+                                                      const ssef Psplat[3],
+                                                      const ssef idirsplat[3],
+                                                      const shuffle_swap_t shufflexyz[3],
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {
-	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+  /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); /* sign-bit mask, two lanes */
 
-	/* fetch node data */
-	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr;
+  /* fetch node data */
+  const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr; /* rows [1..3] used */
 
-	/* intersect ray against child nodes */
-	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+  /* intersect ray against child nodes */
+  const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+  const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+  const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
-	/* calculate { c0min, c1min, -c0max, -c1max} */
-	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-	const ssef tminmax = minmax ^ pn;
-	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+  /* calculate { c0min, c1min, -c0max, -c1max} */
+  ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+  const ssef tminmax = minmax ^ pn; /* presumably flips -c0max, -c1max back to positive */
+  const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* presumably min <= max */
 
-	dist[0] = tminmax[0];
-	dist[1] = tminmax[1];
+  dist[0] = tminmax[0]; /* c0min, going by the lane layout noted above */
+  dist[1] = tminmax[1]; /* c1min, going by the lane layout noted above */
 
-	int mask = movemask(lrhit);
+  int mask = movemask(lrhit); /* only bits 0 and 1 are consumed below */
 
 #  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); /* .x/.y: visibility bits */
+  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+  return cmask;
 #  else
-	return mask & 3;
+  return mask & 3;
 #  endif
 }
 
-ccl_device_forceinline int bvh_aligned_node_intersect_robust(
-        KernelGlobals *kg,
-        const float3& P,
-        const float3& dir,
-        const ssef& tsplat,
-        const ssef Psplat[3],
-        const ssef idirsplat[3],
-        const shuffle_swap_t shufflexyz[3],
-        const float difl,
-        const float extmax,
-        const int nodeAddr,
-        const uint visibility,
-        float dist[2])
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+                                                             const float3 &P, /* unused here */
+                                                             const float3 &dir, /* unused here */
+                                                             const ssef &tsplat,
+                                                             const ssef Psplat[3],
+                                                             const ssef idirsplat[3],
+                                                             const shuffle_swap_t shufflexyz[3],
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int nodeAddr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {
-	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
-	/* fetch node data */
-	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-
-	/* intersect ray against child nodes */
-	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-	/* calculate { c0min, c1min, -c0max, -c1max} */
-	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-	const ssef tminmax = minmax ^ pn;
-
-	if(difl != 0.0f) {
-		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-		float4 *tminmaxview = (float4*)&tminmax;
-		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
-		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-
-	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-	dist[0] = tminmax[0];
-	dist[1] = tminmax[1];
-
-	int mask = movemask(lrhit);
+  /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+  const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); /* sign-bit mask, two lanes */
+
+  /* fetch node data */
+  const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + nodeAddr; /* rows [1..3] used */
+
+  /* intersect ray against child nodes */
+  const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+  const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+  const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+  /* calculate { c0min, c1min, -c0max, -c1max} */
+  ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+  const ssef tminmax = minmax ^ pn; /* presumably flips -c0max, -c1max back to positive */
+
+  if (difl != 0.0f) {
+    float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0);
+    float4 *tminmaxview = (float4 *)&tminmax; /* NOTE(review): drops const to edit lanes */
+    float &c0min = tminmaxview->x, &c1min = tminmaxview->y; /* aliases into tminmax */
+    float &c0max = tminmaxview->z, &c1max = tminmaxview->w; /* aliases into tminmax */
+    float hdiff = 1.0f + difl;
+    float ldiff = 1.0f - difl;
+    if (__float_as_int(cnodes.x) & PATH_RAY_CURVE) { /* NOTE(review): scalar path tests .z */
+      c0min = max(ldiff * c0min, c0min - extmax); /* (1 - difl) scale, floor c0min - extmax */
+      c0max = min(hdiff * c0max, c0max + extmax); /* (1 + difl) scale, cap c0max + extmax */
+    }
+    if (__float_as_int(cnodes.y) & PATH_RAY_CURVE) { /* NOTE(review): scalar path tests .w */
+      c1min = max(ldiff * c1min, c1min - extmax); /* (1 - difl) scale, floor c1min - extmax */
+      c1max = min(hdiff * c1max, c1max + extmax); /* (1 + difl) scale, cap c1max + extmax */
+    }
+  }
+
+  const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* presumably min <= max */
+
+  dist[0] = tminmax[0]; /* read after the optional difl adjustment above */
+  dist[1] = tminmax[1];
+
+  int mask = movemask(lrhit); /* only bits 0 and 1 are consumed below */
 
 #  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0); /* .x/.y: visibility bits */
+  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+  return cmask;
 #  else
-	return mask & 3;
+  return mask & 3;
 #  endif
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                         const float3 P,
                                                         const float3 dir,
-                                                        const ssef& isect_near,
-                                                        const ssef& isect_far,
+                                                        const ssef &isect_near,
+                                                        const ssef &isect_far,
                                                         const int node_addr,
                                                         const uint visibility,
                                                         float dist[2])
 {
-	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
-	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
-
-	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);
-	float3 aligned_P0 = transform_point(&space0, P),
-	       aligned_P1 = transform_point(&space1, P);
-	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
-	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
-	ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
-	                    aligned_P1.x * nrdir1.x,
-	                    0.0f, 0.0f),
-	     lower_y = ssef(aligned_P0.y * nrdir0.y,
-	                    aligned_P1.y * nrdir1.y,
-	                    0.0f,
-	                    0.0f),
-	     lower_z = ssef(aligned_P0.z * nrdir0.z,
-	                    aligned_P1.z * nrdir1.z,
-	                    0.0f,
-	                    0.0f);
-
-	ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
-	     upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
-	     upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
-	ssef tnear_x = min(lower_x, upper_x);
-	ssef tnear_y = min(lower_y, upper_y);
-	ssef tnear_z = min(lower_z, upper_z);
-	ssef tfar_x = max(lower_x, upper_x);
-	ssef tfar_y = max(lower_y, upper_y);
-	ssef tfar_z = max(lower_z, upper_z);
-
-	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-	sseb vmask = tnear <= tfar;
-	dist[0] = tnear.f[0];
-	dist[1] = tnear.f[1];
-
-	int mask = (int)movemask(vmask);
+  Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); /* child 0 */
+  Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); /* child 1 */
+
+  float3 aligned_dir0 = transform_direction(&space0, dir),
+         aligned_dir1 = transform_direction(&space1, dir);
+  float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
+  float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+         nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+  ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
+       lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
+       lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
+
+  ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), /* last two args stay 0 */
+       upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+       upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+  ssef tnear_x = min(lower_x, upper_x); /* per lane: the smaller of the two */
+  ssef tnear_y = min(lower_y, upper_y);
+  ssef tnear_z = min(lower_z, upper_z);
+  ssef tfar_x = max(lower_x, upper_x); /* per lane: the larger of the two */
+  ssef tfar_y = max(lower_y, upper_y);
+  ssef tfar_z = max(lower_z, upper_z);
+
+  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+  sseb vmask = tnear <= tfar; /* first two lanes hold child 0 and child 1 */
+  dist[0] = tnear.f[0];
+  dist[1] = tnear.f[1];
+
+  int mask = (int)movemask(vmask); /* only bits 0 and 1 are consumed below */
 
 #  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); /* .x/.y: visibility bits */
+  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+  return cmask;
 #  else
-	return mask & 3;
+  return mask & 3;
 #  endif
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                                const float3 P,
                                                                const float3 dir,
-                                                               const ssef& isect_near,
-                                                               const ssef& isect_far,
+                                                               const ssef &isect_near,
+                                                               const ssef &isect_far,
                                                                const float difl,
                                                                const int node_addr,
                                                                const uint visibility,
                                                                float dist[2])
 {
-	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
-	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
-
-	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);
-	float3 aligned_P0 = transform_point(&space0, P),
-	       aligned_P1 = transform_point(&space1, P);
-	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
-	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
-	ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
-	                    aligned_P1.x * nrdir1.x,
-	                    0.0f, 0.0f),
-	     lower_y = ssef(aligned_P0.y * nrdir0.y,
-	                    aligned_P1.y * nrdir1.y,
-	                    0.0f,
-	                    0.0f),
-	     lower_z = ssef(aligned_P0.z * nrdir0.z,
-	                    aligned_P1.z * nrdir1.z,
-	                    0.0f,
-	                    0.0f);
-
-	ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
-	     upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
-	     upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
-	ssef tnear_x = min(lower_x, upper_x);
-	ssef tnear_y = min(lower_y, upper_y);
-	ssef tnear_z = min(lower_z, upper_z);
-	ssef tfar_x = max(lower_x, upper_x);
-	ssef tfar_y = max(lower_y, upper_y);
-	ssef tfar_z = max(lower_z, upper_z);
-
-	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
-	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
-	sseb vmask;
-	if(difl != 0.0f) {
-		const float round_down = 1.0f - difl;
-		const float round_up = 1.0f + difl;
-		vmask = round_down*tnear <= round_up*tfar;
-	}
-	else {
-		vmask = tnear <= tfar;
-	}
-
-	dist[0] = tnear.f[0];
-	dist[1] = tnear.f[1];
-
-	int mask = (int)movemask(vmask);
+  Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
+  Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
+
+  float3 aligned_dir0 = transform_direction(&space0, dir),
+         aligned_dir1 = transform_direction(&space1, dir);
+  float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
+  float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
+         nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+  ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
+       lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
+       lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
+
+  ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+       upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+       upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+  ssef tnear_x = min(lower_x, upper_x);
+  ssef tnear_y = min(lower_y, upper_y);
+  ssef tnear_z = min(lower_z, upper_z);
+  ssef tfar_x = max(lower_x, upper_x);
+  ssef tfar_y = max(lower_y, upper_y);
+  ssef tfar_z = max(lower_z, upper_z);
+
+  const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+  const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+  sseb vmask;
+  if (difl != 0.0f) {
+    const float round_down = 1.0f - difl;
+    const float round_up = 1.0f + difl;
+    vmask = round_down * tnear <= round_up * tfar;
+  }
+  else {
+    vmask = tnear <= tfar;
+  }
+
+  dist[0] = tnear.f[0];
+  dist[1] = tnear.f[1];
+
+  int mask = (int)movemask(vmask);
 
 #  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
+  /* this visibility test gives a 5% performance hit, how to solve? */
+  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
+              (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
+  return cmask;
 #  else
-	return mask & 3;
+  return mask & 3;
 #  endif
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                              const float3& P,
-                                              const float3& dir,
-                                              const ssef& isect_near,
-                                              const ssef& isect_far,
-                                              const ssef& tsplat,
+                                              const float3 &P,
+                                              const float3 &dir,
+                                              const ssef &isect_near,
+                                              const ssef &isect_far,
+                                              const ssef &tsplat,
                                               const ssef Psplat[3],
                                               const ssef idirsplat[3],
                                               const shuffle_swap_t shufflexyz[3],
@@ -585,37 +533,23 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                               const uint visibility,
                                               float dist[2])
 {
-	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect(kg,
-		                                    P,
-		                                    dir,
-		                                    isect_near,
-		                                    isect_far,
-		                                    node_addr,
-		                                    visibility,
-		                                    dist);
-	}
-	else {
-		return bvh_aligned_node_intersect(kg,
-		                                  P,
-		                                  dir,
-		                                  tsplat,
-		                                  Psplat,
-		                                  idirsplat,
-		                                  shufflexyz,
-		                                  node_addr,
-		                                  visibility,
-		                                  dist);
-	}
+  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+    return bvh_unaligned_node_intersect(
+        kg, P, dir, isect_near, isect_far, node_addr, visibility, dist);
+  }
+  else {
+    return bvh_aligned_node_intersect(
+        kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist);
+  }
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                     const float3& P,
-                                                     const float3& dir,
-                                                     const ssef& isect_near,
-                                                     const ssef& isect_far,
-                                                     const ssef& tsplat,
+                                                     const float3 &P,
+                                                     const float3 &dir,
+                                                     const ssef &isect_near,
+                                                     const ssef &isect_far,
+                                                     const ssef &tsplat,
                                                      const ssef Psplat[3],
                                                      const ssef idirsplat[3],
                                                      const shuffle_swap_t shufflexyz[3],
@@ -625,31 +559,24 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                      const uint visibility,
                                                      float dist[2])
 {
-	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect_robust(kg,
-		                                           P,
-		                                           dir,
-		                                           isect_near,
-		                                           isect_far,
-		                                           difl,
-		                                           node_addr,
-		                                           visibility,
-		                                           dist);
-	}
-	else {
-		return bvh_aligned_node_intersect_robust(kg,
-		                                         P,
-		                                         dir,
-		                                         tsplat,
-		                                         Psplat,
-		                                         idirsplat,
-		                                         shufflexyz,
-		                                         difl,
-		                                         extmax,
-		                                         node_addr,
-		                                         visibility,
-		                                         dist);
-	}
+  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+    return bvh_unaligned_node_intersect_robust(
+        kg, P, dir, isect_near, isect_far, difl, node_addr, visibility, dist);
+  }
+  else {
+    return bvh_aligned_node_intersect_robust(kg,
+                                             P,
+                                             dir,
+                                             tsplat,
+                                             Psplat,
+                                             idirsplat,
+                                             shufflexyz,
+                                             difl,
+                                             extmax,
+                                             node_addr,
+                                             visibility,
+                                             dist);
+  }
 }
-#endif  /* !defined(__KERNEL_SSE2__) */
+#endif /* !defined(__KERNEL_SSE2__) */