Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/kernel/CMakeLists.txt1
-rw-r--r--intern/cycles/kernel/geom/geom_bvh.h18
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_nodes.h659
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_shadow.h132
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_subsurface.h118
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_traversal.h207
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_volume.h123
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_volume_all.h123
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh.h344
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh_shadow.h73
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh_subsurface.h57
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh_traversal.h106
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh_volume.h73
-rw-r--r--intern/cycles/kernel/geom/geom_qbvh_volume_all.h73
14 files changed, 1574 insertions, 533 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index f0adbc03e22..3c2f7747f34 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -141,6 +141,7 @@ set(SRC_GEOM_HEADERS
geom/geom.h
geom/geom_attribute.h
geom/geom_bvh.h
+ geom/geom_bvh_nodes.h
geom/geom_bvh_shadow.h
geom/geom_bvh_subsurface.h
geom/geom_bvh_traversal.h
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index d0eedd3396a..f8d563f0afa 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -77,6 +77,8 @@ CCL_NAMESPACE_BEGIN
/* Regular BVH traversal */
+#include "geom_bvh_nodes.h"
+
#define BVH_FUNCTION_NAME bvh_intersect
#define BVH_FUNCTION_FEATURES 0
#include "geom_bvh_traversal.h"
@@ -109,13 +111,13 @@ CCL_NAMESPACE_BEGIN
#if defined(__SUBSURFACE__)
# define BVH_FUNCTION_NAME bvh_intersect_subsurface
-# define BVH_FUNCTION_FEATURES 0
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_subsurface.h"
#endif
#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-# define BVH_FUNCTION_FEATURES BVH_MOTION
+# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
# include "geom_bvh_subsurface.h"
#endif
@@ -123,19 +125,19 @@ CCL_NAMESPACE_BEGIN
#if defined(__VOLUME__)
# define BVH_FUNCTION_NAME bvh_intersect_volume
-# define BVH_FUNCTION_FEATURES 0
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_volume.h"
#endif
#if defined(__VOLUME__) && defined(__INSTANCING__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
# include "geom_bvh_volume.h"
#endif
#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
# include "geom_bvh_volume.h"
#endif
@@ -175,19 +177,19 @@ CCL_NAMESPACE_BEGIN
#if defined(__VOLUME_RECORD_ALL__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all
-# define BVH_FUNCTION_FEATURES 0
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif
#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif
#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
# include "geom_bvh_volume_all.h"
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_nodes.h b/intern/cycles/kernel/geom/geom_bvh_nodes.h
new file mode 100644
index 00000000000..deb91ec95f5
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_nodes.h
@@ -0,0 +1,659 @@
+/*
+ * Copyright 2011-2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO(sergey): Look into avoiding use of the full Transform and using a 3x3
+// matrix plus a 3-vector instead, which might be faster.
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ int child) /* child slot; every slot occupies three consecutive rows */
+{ /* Assemble the Transform stored for one child of an unaligned node. */
+ Transform space;
+ const int child_addr = nodeAddr + child * 3; /* skip three rows per preceding child */
+ space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); /* +1: row 0 of the node is not part of the matrix */
+ space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
+ space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
+ space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); /* fourth row is not stored; it is always (0, 0, 0, 1) */
+ return space;
+}
+
+#if !defined(__KERNEL_SSE2__)
+ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ const float3 P, /* point subtracted from every bound; presumably the ray origin */
+ const float3 idir, /* per-axis factor applied to (bound - P); presumably the inverse ray direction */
+ const float t, /* upper clamp applied to both exit distances */
+ const int nodeAddr, /* index of the first of the four node rows in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float *dist) /* out: dist[0] / dist[1] receive the entry distance of child 0 / child 1 */
+{ /* Slab test against both child boxes of an axis-aligned node; result has bit 0 set for child 0 and bit 1 for child 1. */
+
+ /* fetch node data: row 0 carries the per-child flags, rows 1-3 the x, y and z bounds */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+ /* intersect ray against child nodes; child 0 reads components x (lo) and z (hi) of each row */
+ float c0lox = (node0.x - P.x) * idir.x;
+ float c0hix = (node0.z - P.x) * idir.x;
+ float c0loy = (node1.x - P.y) * idir.y;
+ float c0hiy = (node1.z - P.y) * idir.y;
+ float c0loz = (node2.x - P.z) * idir.z;
+ float c0hiz = (node2.z - P.z) * idir.z;
+ float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); /* entry distance, clamped to >= 0 */
+ float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); /* exit distance, clamped to <= t */
+ /* child 1 reads components y (lo) and w (hi) of each row */
+ float c1lox = (node0.y - P.x) * idir.x;
+ float c1hix = (node0.w - P.x) * idir.x;
+ float c1loy = (node1.y - P.y) * idir.y;
+ float c1hiy = (node1.w - P.y) * idir.y;
+ float c1loz = (node2.y - P.z) * idir.z;
+ float c1hiz = (node2.w - P.z) * idir.z;
+ float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+ /* entry distances are written even for a child that is missed */
+ dist[0] = c0min;
+ dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+ return ((c0max >= c0min)? 1: 0) |
+ ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P, /* point subtracted from every bound; presumably the ray origin */
+ const float3 idir, /* per-axis factor applied to (bound - P); presumably the inverse ray direction */
+ const float t, /* upper clamp applied to both exit distances */
+ const float difl, /* relative widening of the hit interval; 0 disables the widening */
+ const float extmax, /* absolute cap on how far either end of the interval may move */
+ const int nodeAddr, /* index of the first of the four node rows in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float *dist) /* out: dist[0] / dist[1] receive the (possibly widened) entry distance of child 0 / child 1 */
+{ /* Same slab test as bvh_aligned_node_intersect, with the interval of curve children widened by difl / extmax. */
+
+ /* fetch node data: row 0 carries the per-child flags, rows 1-3 the x, y and z bounds */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
+
+ /* intersect ray against child nodes; child 0 reads components x (lo) and z (hi) of each row */
+ float c0lox = (node0.x - P.x) * idir.x;
+ float c0hix = (node0.z - P.x) * idir.x;
+ float c0loy = (node1.x - P.y) * idir.y;
+ float c0hiy = (node1.z - P.y) * idir.y;
+ float c0loz = (node2.x - P.z) * idir.z;
+ float c0hiz = (node2.z - P.z) * idir.z;
+ float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+ float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+ /* child 1 reads components y (lo) and w (hi) of each row */
+ float c1lox = (node0.y - P.x) * idir.x;
+ float c1hix = (node0.w - P.x) * idir.x;
+ float c1loy = (node1.y - P.y) * idir.y;
+ float c1hiy = (node1.w - P.y) * idir.y;
+ float c1loz = (node2.y - P.z) * idir.z;
+ float c1hiz = (node2.w - P.z) * idir.z;
+ float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+ if(difl != 0.0f) {
+ float hdiff = 1.0f + difl;
+ float ldiff = 1.0f - difl;
+ if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { /* flags of child 0 are in cnodes.x, the component the visibility test below reads; .z is not a flag word */
+ c0min = max(ldiff * c0min, c0min - extmax); /* pull the entry closer, by at most extmax */
+ c0max = min(hdiff * c0max, c0max + extmax); /* push the exit further, by at most extmax */
+ }
+ if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { /* flags of child 1 are in cnodes.y; .w is not a flag word */
+ c1min = max(ldiff * c1min, c1min - extmax);
+ c1max = min(hdiff * c1max, c1max + extmax);
+ }
+ }
+
+ dist[0] = c0min;
+ dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+ return ((c0max >= c0min)? 1: 0) |
+ ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ KernelGlobals *kg,
+ const float3 P, /* ray origin, in the space the node lives in */
+ const float3 dir, /* ray direction, in the same space */
+ const float t, /* upper clamp for the exit distance */
+ int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ int child, /* child slot whose Transform is fetched */
+ float *dist) /* out: entry distance, written even when the test fails */
+{ /* Test the ray against one child of an unaligned node after moving the ray into the space of that child. */
+ Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+ float3 aligned_dir = transform_direction(&space, dir);
+ float3 aligned_P = transform_point(&space, P);
+ float3 nrdir = -bvh_inverse_direction(aligned_dir); /* presumably -1 / aligned_dir per axis; confirm against bvh_inverse_direction */
+ float3 tLowerXYZ = aligned_P * nrdir; /* with that assumption: distance to the planes at coordinate 0 */
+ float3 tUpperXYZ = tLowerXYZ - nrdir; /* with that assumption: distance to the planes at coordinate 1 */
+ const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+ const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+ const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+ const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x);
+ const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y);
+ const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z);
+ const float tNear = max4(0.0f, tNearX, tNearY, tNearZ); /* entry distance, clamped to >= 0 */
+ const float tFar = min4(t, tFarX, tFarY, tFarZ); /* exit distance, clamped to <= t */
+ *dist = tNear;
+ return tNear <= tFar;
+}
+
+ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ KernelGlobals *kg,
+ const float3 P, /* ray origin, in the space the node lives in */
+ const float3 dir, /* ray direction, in the same space */
+ const float t, /* upper clamp for the exit distance */
+ const float difl, /* relative slack applied to the final comparison; 0 gives the plain test */
+ const float /*extmax*/, /* accepted for signature parity but unnamed and unused here */
+ int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ int child, /* child slot whose Transform is fetched */
+ float *dist) /* out: entry distance (not widened), written even when the test fails */
+{ /* Variant of bvh_unaligned_node_intersect_child whose near/far comparison is relaxed by difl. */
+ Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+ float3 aligned_dir = transform_direction(&space, dir);
+ float3 aligned_P = transform_point(&space, P);
+ float3 nrdir = -bvh_inverse_direction(aligned_dir); /* presumably -1 / aligned_dir per axis; confirm against bvh_inverse_direction */
+ float3 tLowerXYZ = aligned_P * nrdir; /* with that assumption: distance to the planes at coordinate 0 */
+ float3 tUpperXYZ = tLowerXYZ - nrdir; /* with that assumption: distance to the planes at coordinate 1 */
+ const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+ const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+ const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+ const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x);
+ const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y);
+ const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z);
+ const float tNear = max4(0.0f, tNearX, tNearY, tNearZ);
+ const float tFar = min4(t, tFarX, tFarY, tFarZ);
+ *dist = tNear;
+ if(difl != 0.0f) {
+ /* TODO(sergey): Same as for QBVH, needs a proper use. */
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ return round_down*tNear <= round_up*tFar; /* unlike the aligned variant, the slack is applied regardless of any curve flag */
+ }
+ else {
+ return tNear <= tFar;
+ }
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction */
+ const float3 idir, /* not read in this function */
+ const float t, /* upper clamp for the exit distances */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float *dist) /* out: dist[0] / dist[1] receive the entry distance of child 0 / child 1 */
+{ /* Test both children of an unaligned node one after the other; result has bit 0 set for child 0 and bit 1 for child 1. */
+ int mask = 0;
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); /* row 0: per-child flags in x and y */
+ if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.x) & visibility))
+#endif
+ { /* braces stay unconditional so the block is valid with and without the if above */
+ mask |= 1;
+ }
+ }
+ if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.y) & visibility))
+#endif
+ {
+ mask |= 2;
+ }
+ }
+ return mask;
+}
+
+ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction */
+ const float3 idir, /* not read in this function */
+ const float t, /* upper clamp for the exit distances */
+ const float difl, /* forwarded to the per-child robust test */
+ const float extmax, /* forwarded to the per-child robust test */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float *dist) /* out: dist[0] / dist[1] receive the entry distance of child 0 / child 1 */
+{ /* Robust counterpart of bvh_unaligned_node_intersect; result has bit 0 set for child 0 and bit 1 for child 1. */
+ int mask = 0;
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); /* row 0: per-child flags in x and y */
+ if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.x) & visibility))
+#endif
+ { /* braces stay unconditional so the block is valid with and without the if above */
+ mask |= 1;
+ }
+ }
+ if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(cnodes.y) & visibility))
+#endif
+ {
+ mask |= 2;
+ }
+ }
+ return mask;
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction; only the unaligned path receives it */
+ const float3 idir, /* forwarded to both paths */
+ const float t, /* upper clamp for the exit distances */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* forwarded to the selected test */
+ float dist[2]) /* out: entry distances of child 0 and child 1 */
+{ /* Dispatch to the aligned or unaligned two-child test depending on a flag stored in the node. */
+ float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { /* the flag is read from the x component of row 0 */
+ return bvh_unaligned_node_intersect(kg,
+ P,
+ dir,
+ idir,
+ t,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect(kg,
+ P,
+ idir,
+ t,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction; only the unaligned path receives it */
+ const float3 idir, /* forwarded to both paths */
+ const float t, /* upper clamp for the exit distances */
+ const float difl, /* forwarded to the selected robust test */
+ const float extmax, /* forwarded to the selected robust test */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* forwarded to the selected test */
+ float dist[2]) /* out: entry distances of child 0 and child 1 */
+{ /* Dispatch to the aligned or unaligned robust two-child test depending on a flag stored in the node. */
+ float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { /* the flag is read from the x component of row 0 */
+ return bvh_unaligned_node_intersect_robust(kg,
+ P,
+ dir,
+ idir,
+ t,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect_robust(kg,
+ P,
+ idir,
+ t,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+}
+#else /* !defined(__KERNEL_SSE2__) */
+
+int ccl_device_inline bvh_aligned_node_intersect(
+ KernelGlobals *kg,
+ const float3& P, /* not read here; the splatted copy in Psplat is used instead */
+ const float3& dir, /* not read here */
+ const ssef& tsplat, /* folded into the lane-wise max; a caller in this diff passes (0, 0, -isect_t, -isect_t) */
+ const ssef Psplat[3], /* one vector per axis, subtracted from the shuffled bounds */
+ const ssef idirsplat[3], /* one vector per axis, multiplied onto (bound - Psplat) */
+ const shuffle_swap_t shufflexyz[3], /* per-axis lane shuffle applied to the bounds row */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float dist[2]) /* out: lanes 0 and 1 of tminmax, i.e. the entry distances of child 0 and child 1 */
+{ /* SSE two-child slab test for an axis-aligned node; result has bit 0 set for child 0 and bit 1 for child 1. */
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); /* sign-bit mask for lanes 2 and 3 */
+
+ /* fetch node data: direct pointer into the node array, rows 1-3 are the x, y and z bounds */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+ /* calculate { c0min, c1min, -c0max, -c1max} in minmax; the xor with pn then flips lanes 2 and 3 back to positive */
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* lanes 0 and 1 compare each min against its max */
+
+ dist[0] = tminmax[0];
+ dist[1] = tminmax[1];
+
+ int mask = movemask(lrhit);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_aligned_node_intersect_robust(
+ KernelGlobals *kg,
+ const float3& P, /* not read here; the splatted copy in Psplat is used instead */
+ const float3& dir, /* not read here */
+ const ssef& tsplat, /* folded into the lane-wise max; a caller in this diff passes (0, 0, -isect_t, -isect_t) */
+ const ssef Psplat[3], /* one vector per axis, subtracted from the shuffled bounds */
+ const ssef idirsplat[3], /* one vector per axis, multiplied onto (bound - Psplat) */
+ const shuffle_swap_t shufflexyz[3], /* per-axis lane shuffle applied to the bounds row */
+ const float difl, /* relative widening of the hit interval; 0 disables the widening */
+ const float extmax, /* absolute cap on how far either end of the interval may move */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float dist[2]) /* out: lanes 0 and 1 of tminmax after the optional widening */
+{ /* SSE two-child slab test with the interval of curve children widened by difl / extmax. */
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); /* sign-bit mask for lanes 2 and 3 */
+
+ /* fetch node data: direct pointer into the node array, rows 1-3 are the x, y and z bounds */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+ /* calculate { c0min, c1min, -c0max, -c1max} in minmax; the xor with pn then flips lanes 2 and 3 back to positive */
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ ssef tminmax = minmax ^ pn; /* deliberately not const: the lanes are rewritten in place below */
+
+ if(difl != 0.0f) {
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ float4 *tminmaxview = (float4*)&tminmax; /* per-lane access to the vector */
+ float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
+ float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
+ float hdiff = 1.0f + difl;
+ float ldiff = 1.0f - difl;
+ if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+ c0min = max(ldiff * c0min, c0min - extmax); /* pull the entry closer, by at most extmax */
+ c0max = min(hdiff * c0max, c0max + extmax); /* push the exit further, by at most extmax */
+ }
+ if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+ c1min = max(ldiff * c1min, c1min - extmax);
+ c1max = min(hdiff * c1max, c1max + extmax);
+ }
+ }
+
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* must see the widened lanes written above */
+
+ dist[0] = tminmax[0];
+ dist[1] = tminmax[1];
+
+ int mask = movemask(lrhit);
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction */
+ const ssef& tnear, /* lower clamp for the entry distances, one value per lane */
+ const ssef& tfar, /* upper clamp for the exit distances, one value per lane */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float dist[2]) /* out: lanes 0 and 1 of tNear, written even for a missed child */
+{ /* SSE test of both children of an unaligned node; lane 0 is child 0, lane 1 is child 1, lanes 2-3 are padding. */
+ Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+ Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+ float3 aligned_dir0 = transform_direction(&space0, dir),
+ aligned_dir1 = transform_direction(&space1, dir);
+ float3 aligned_P0 = transform_point(&space0, P),
+ aligned_P1 = transform_point(&space1, P);
+ float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), /* presumably -1 / aligned_dir per axis; confirm against bvh_inverse_direction */
+ nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+ ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+ aligned_P1.x * nrdir1.x,
+ 0.0f, 0.0f),
+ tLowerY = ssef(aligned_P0.y * nrdir0.y,
+ aligned_P1.y * nrdir1.y,
+ 0.0f,
+ 0.0f),
+ tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+ aligned_P1.z * nrdir1.z,
+ 0.0f,
+ 0.0f);
+
+ ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+ tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+ tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+ ssef tnear_x = min(tLowerX, tUpperX);
+ ssef tnear_y = min(tLowerY, tUpperY);
+ ssef tnear_z = min(tLowerZ, tUpperZ);
+ ssef tfar_x = max(tLowerX, tUpperX);
+ ssef tfar_y = max(tLowerY, tUpperY);
+ ssef tfar_z = max(tLowerZ, tUpperZ);
+
+ const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+ const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+ sseb vmask = tNear <= tFar;
+ dist[0] = tNear.f[0];
+ dist[1] = tNear.f[1];
+
+ int mask = (int)movemask(vmask); /* bits 2 and 3 come from the padding lanes and are discarded below */
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ const float3 P, /* ray origin */
+ const float3 dir, /* ray direction */
+ const ssef& tnear, /* lower clamp for the entry distances, one value per lane */
+ const ssef& tfar, /* upper clamp for the exit distances, one value per lane */
+ const float difl, /* relative slack applied to the final comparison; 0 gives the plain test */
+ const float /*extmax*/, /* accepted for signature parity but unnamed and unused here */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* mask tested against the per-child flags when __VISIBILITY_FLAG__ is defined */
+ float dist[2]) /* out: lanes 0 and 1 of tNear (not widened), written even for a missed child */
+{ /* SSE test of both children of an unaligned node with the near/far comparison relaxed by difl. */
+ Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+ Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+ float3 aligned_dir0 = transform_direction(&space0, dir),
+ aligned_dir1 = transform_direction(&space1, dir);
+ float3 aligned_P0 = transform_point(&space0, P),
+ aligned_P1 = transform_point(&space1, P);
+ float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), /* presumably -1 / aligned_dir per axis; confirm against bvh_inverse_direction */
+ nrdir1 = -bvh_inverse_direction(aligned_dir1);
+
+ ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
+ aligned_P1.x * nrdir1.x,
+ 0.0f, 0.0f),
+ tLowerY = ssef(aligned_P0.y * nrdir0.y,
+ aligned_P1.y * nrdir1.y,
+ 0.0f,
+ 0.0f),
+ tLowerZ = ssef(aligned_P0.z * nrdir0.z,
+ aligned_P1.z * nrdir1.z,
+ 0.0f,
+ 0.0f);
+
+ ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
+ tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
+ tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
+
+ ssef tnear_x = min(tLowerX, tUpperX);
+ ssef tnear_y = min(tLowerY, tUpperY);
+ ssef tnear_z = min(tLowerZ, tUpperZ);
+ ssef tfar_x = max(tLowerX, tUpperX);
+ ssef tfar_y = max(tLowerY, tUpperY);
+ ssef tfar_z = max(tLowerZ, tUpperZ);
+
+ const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+ const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+ sseb vmask;
+ if(difl != 0.0f) {
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ vmask = round_down*tNear <= round_up*tFar; /* slack is applied regardless of any curve flag */
+ }
+ else {
+ vmask = tNear <= tFar;
+ }
+
+ dist[0] = tNear.f[0];
+ dist[1] = tNear.f[1];
+
+ int mask = (int)movemask(vmask); /* bits 2 and 3 come from the padding lanes and are discarded below */
+
+# ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+ (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+ return cmask;
+# else
+ return mask & 3;
+# endif
+}
+
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ const float3& P, /* ray origin, forwarded to both paths */
+ const float3& dir, /* ray direction, forwarded to both paths */
+ const ssef& tnear, /* only the unaligned path receives it */
+ const ssef& tfar, /* only the unaligned path receives it */
+ const ssef& tsplat, /* only the aligned path receives it */
+ const ssef Psplat[3], /* only the aligned path receives it */
+ const ssef idirsplat[3], /* only the aligned path receives it */
+ const shuffle_swap_t shufflexyz[3], /* only the aligned path receives it */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* forwarded to the selected test */
+ float dist[2]) /* out: entry distances of child 0 and child 1 */
+{ /* SSE dispatch to the aligned or unaligned two-child test depending on a flag stored in the node. */
+ float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { /* the flag is read from the x component of row 0 */
+ return bvh_unaligned_node_intersect(kg,
+ P,
+ dir,
+ tnear,
+ tfar,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect(kg,
+ P,
+ dir,
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+}
+
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ const float3& P, /* ray origin, forwarded to both paths */
+ const float3& dir, /* ray direction, forwarded to both paths */
+ const ssef& tnear, /* only the unaligned path receives it */
+ const ssef& tfar, /* only the unaligned path receives it */
+ const ssef& tsplat, /* only the aligned path receives it */
+ const ssef Psplat[3], /* only the aligned path receives it */
+ const ssef idirsplat[3], /* only the aligned path receives it */
+ const shuffle_swap_t shufflexyz[3], /* only the aligned path receives it */
+ const float difl, /* forwarded to the selected robust test */
+ const float extmax, /* forwarded to the selected robust test */
+ const int nodeAddr, /* index of the first row of the node in __bvh_nodes */
+ const uint visibility, /* forwarded to the selected test */
+ float dist[2]) /* out: entry distances of child 0 and child 1 */
+{ /* SSE dispatch to the aligned or unaligned robust two-child test depending on a flag stored in the node. */
+ float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { /* the flag is read from the x component of row 0 */
+ return bvh_unaligned_node_intersect_robust(kg,
+ P,
+ dir,
+ tnear,
+ tfar,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+ else {
+ return bvh_aligned_node_intersect_robust(kg,
+ P,
+ dir,
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
+ }
+}
+#endif /* !defined(__KERNEL_SSE2__) */
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 60cc50e8bfd..a54c6024152 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -21,6 +21,12 @@
# include "geom_qbvh_shadow.h"
#endif
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
@@ -41,7 +47,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
* - likely and unlikely for if() statements
* - test restrict attribute for pointers
*/
-
+
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
+
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@@ -94,86 +103,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
- bool traverseChild0, traverseChild1;
- int nodeAddrChild1;
+ int nodeAddrChild1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
- /* Intersect two child bounding boxes, non-SSE version */
- float t = isect_t;
-
- /* fetch node data */
- float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
- /* decide which nodes to traverse next */
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
- traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
-# else
- traverseChild0 = (c0max >= c0min);
- traverseChild1 = (c1max >= c1min);
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
# endif
-
+ idir,
+ isect_t,
+ nodeAddr,
+ PATH_RAY_SHADOW,
+ dist);
#else // __KERNEL_SSE2__
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
- const float4 cnodes = ((float4*)bvh_nodes)[3];
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- /* decide which nodes to traverse next */
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
- traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
-# else
- traverseChild0 = (movemask(lrhit) & 1);
- traverseChild1 = (movemask(lrhit) & 2);
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ PATH_RAY_SHADOW,
+ dist);
#endif // __KERNEL_SSE2__
- nodeAddr = __float_as_int(cnodes.x);
- nodeAddrChild1 = __float_as_int(cnodes.y);
+ nodeAddr = __float_as_int(cnodes.z);
+ nodeAddrChild1 = __float_as_int(cnodes.w);
- if(traverseChild0 && traverseChild1) {
- /* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
- bool closestChild1 = (c1min < c0min);
-#else
- bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@@ -186,12 +153,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- /* one child was intersected */
- if(traverseChild1) {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
- else if(!traverseChild0) {
- /* neither child was intersected */
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@@ -238,7 +205,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if BVH_FEATURE(BVH_HAIR)
case PRIMITIVE_CURVE:
case PRIMITIVE_MOTION_CURVE: {
- if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
else
hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
@@ -317,6 +284,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -369,6 +339,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -410,3 +383,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index e43eedad7bc..88aaf01d682 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -21,6 +21,12 @@
# include "geom_qbvh_subsurface.h"
#endif
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
/* This is a template BVH traversal function for subsurface scattering, where
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
@@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@@ -100,79 +109,47 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
/* traversal loop */
do {
- do
- {
+ do {
/* traverse internal nodes */
- while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
- {
- bool traverseChild0, traverseChild1;
- int nodeAddrChild1;
+ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+ int nodeAddrChild1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
- /* Intersect two child bounding boxes, non-SSE version */
- float t = isect_t;
-
- /* fetch node data */
- float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (c0max >= c0min);
- traverseChild1 = (c1max >= c1min);
-
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect_t,
+ nodeAddr,
+ PATH_RAY_ALL_VISIBILITY,
+ dist);
#else // __KERNEL_SSE2__
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
- const float4 cnodes = ((float4*)bvh_nodes)[3];
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (movemask(lrhit) & 1);
- traverseChild1 = (movemask(lrhit) & 2);
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ PATH_RAY_ALL_VISIBILITY,
+ dist);
#endif // __KERNEL_SSE2__
- nodeAddr = __float_as_int(cnodes.x);
- nodeAddrChild1 = __float_as_int(cnodes.y);
+ nodeAddr = __float_as_int(cnodes.z);
+ nodeAddrChild1 = __float_as_int(cnodes.w);
- if(traverseChild0 && traverseChild1) {
- /* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
- bool closestChild1 = (c1min < c0min);
-#else
- bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@@ -185,12 +162,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- /* one child was intersected */
- if(traverseChild1) {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
- else if(!traverseChild0) {
- /* neither child was intersected */
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@@ -286,3 +263,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 5b9c7b46f82..f409dd5f403 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -21,6 +21,14 @@
# include "geom_qbvh_traversal.h"
#endif
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
+#endif
+
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
@@ -49,7 +57,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
* - likely and unlikely for if() statements
* - test restrict attribute for pointers
*/
-
+
/* traversal stack in CUDA thread-local memory */
int traversalStack[BVH_STACK_SIZE];
traversalStack[0] = ENTRYPOINT_SENTINEL;
@@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
+
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect->t);
+# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@@ -101,121 +112,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
- bool traverseChild0, traverseChild1;
- int nodeAddrChild1;
+ int nodeAddrChild1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
- /* Intersect two child bounding boxes, non-SSE version */
- float t = isect->t;
-
- /* fetch node data */
- float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
+ traverse_mask = NODE_INTERSECT_ROBUST(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
}
+ else
# endif
-
- /* decide which nodes to traverse next */
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
- traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-# else
- traverseChild0 = (c0max >= c0min);
- traverseChild1 = (c1max >= c1min);
-# endif
-
+ {
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ nodeAddr,
+ visibility,
+ dist);
+ }
#else // __KERNEL_SSE2__
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
- const float4 cnodes = ((float4*)bvh_nodes)[3];
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
-
# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
- float4 *tminmaxview = (float4*)&tminmax;
- float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
- float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
-
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
+ traverse_mask = NODE_INTERSECT_ROBUST(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ difl,
+ extmax,
+ nodeAddr,
+ visibility,
+ dist);
}
+ else
# endif
-
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- /* decide which nodes to traverse next */
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
- traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-# else
- traverseChild0 = (movemask(lrhit) & 1);
- traverseChild1 = (movemask(lrhit) & 2);
-# endif
+ {
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ visibility,
+ dist);
+ }
#endif // __KERNEL_SSE2__
- nodeAddr = __float_as_int(cnodes.x);
- nodeAddrChild1 = __float_as_int(cnodes.y);
+ nodeAddr = __float_as_int(cnodes.z);
+ nodeAddrChild1 = __float_as_int(cnodes.w);
- if(traverseChild0 && traverseChild1) {
- /* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
- bool closestChild1 = (c1min < c0min);
-#else
- bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@@ -228,12 +204,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- /* one child was intersected */
- if(traverseChild1) {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
- else if(!traverseChild0) {
- /* neither child was intersected */
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@@ -268,6 +244,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
#else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -287,6 +266,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
# else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -313,6 +295,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
# else
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -342,6 +327,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -376,6 +364,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -433,3 +424,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
index 36033ecf459..5e70ce99f51 100644
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -18,7 +18,13 @@
*/
#ifdef __QBVH__
-#include "geom_qbvh_volume.h"
+# include "geom_qbvh_volume.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function for volumes, where
@@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
+
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect->t);
+# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@@ -91,75 +100,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
- bool traverseChild0, traverseChild1;
- int nodeAddrChild1;
+ int nodeAddrChild1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
- /* Intersect two child bounding boxes, non-SSE version */
- float t = isect->t;
-
- /* fetch node data */
- float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (c0max >= c0min);
- traverseChild1 = (c1max >= c1min);
-
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect->t,
+ nodeAddr,
+ visibility,
+ dist);
#else // __KERNEL_SSE2__
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
- const float4 cnodes = ((float4*)bvh_nodes)[3];
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
-
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (movemask(lrhit) & 1);
- traverseChild1 = (movemask(lrhit) & 2);
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ visibility,
+ dist);
#endif // __KERNEL_SSE2__
- nodeAddr = __float_as_int(cnodes.x);
- nodeAddrChild1 = __float_as_int(cnodes.y);
+ nodeAddr = __float_as_int(cnodes.z);
+ nodeAddrChild1 = __float_as_int(cnodes.w);
- if(traverseChild0 && traverseChild1) {
- /* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
- bool closestChild1 = (c1min < c0min);
-#else
- bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@@ -172,12 +150,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- /* one child was intersected */
- if(traverseChild1) {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
- else if(!traverseChild0) {
- /* neither child was intersected */
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@@ -258,6 +236,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -298,6 +279,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect->t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -337,3 +321,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
index f9536148933..ab5ac8505a3 100644
--- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
@@ -18,7 +18,13 @@
*/
#ifdef __QBVH__
-#include "geom_qbvh_volume_all.h"
+# include "geom_qbvh_volume_all.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT bvh_node_intersect
+#else
+# define NODE_INTERSECT bvh_aligned_node_intersect
#endif
/* This is a template BVH traversal function for volumes, where
@@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
+
const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
ssef Psplat[3], idirsplat[3];
+# if BVH_FEATURE(BVH_HAIR)
+ ssef tnear(0.0f), tfar(isect_t);
+# endif
shuffle_swap_t shufflexyz[3];
Psplat[0] = ssef(P.x);
@@ -95,75 +104,44 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
do {
/* traverse internal nodes */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
- bool traverseChild0, traverseChild1;
- int nodeAddrChild1;
+ int nodeAddrChild1, traverse_mask;
+ float dist[2];
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
#if !defined(__KERNEL_SSE2__)
- /* Intersect two child bounding boxes, non-SSE version */
- float t = isect_array->t;
-
- /* fetch node data */
- float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (c0max >= c0min);
- traverseChild1 = (c1max >= c1min);
-
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+# if BVH_FEATURE(BVH_HAIR)
+ dir,
+# endif
+ idir,
+ isect_t,
+ nodeAddr,
+ visibility,
+ dist);
#else // __KERNEL_SSE2__
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
- const float4 cnodes = ((float4*)bvh_nodes)[3];
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
-
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- /* decide which nodes to traverse next */
- traverseChild0 = (movemask(lrhit) & 1);
- traverseChild1 = (movemask(lrhit) & 2);
+ traverse_mask = NODE_INTERSECT(kg,
+ P,
+ dir,
+# if BVH_FEATURE(BVH_HAIR)
+ tnear,
+ tfar,
+# endif
+ tsplat,
+ Psplat,
+ idirsplat,
+ shufflexyz,
+ nodeAddr,
+ visibility,
+ dist);
#endif // __KERNEL_SSE2__
- nodeAddr = __float_as_int(cnodes.x);
- nodeAddrChild1 = __float_as_int(cnodes.y);
+ nodeAddr = __float_as_int(cnodes.z);
+ nodeAddrChild1 = __float_as_int(cnodes.w);
- if(traverseChild0 && traverseChild1) {
- /* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
- bool closestChild1 = (c1min < c0min);
-#else
- bool closestChild1 = tminmax[1] < tminmax[0];
-#endif
+ if(traverse_mask == 3) {
+ /* Both children were intersected, push the farther one. */
+ bool closestChild1 = (dist[1] < dist[0]);
if(closestChild1) {
int tmp = nodeAddr;
@@ -176,12 +154,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
traversalStack[stackPtr] = nodeAddrChild1;
}
else {
- /* one child was intersected */
- if(traverseChild1) {
+ /* One child was intersected. */
+ if(traverse_mask == 2) {
nodeAddr = nodeAddrChild1;
}
- else if(!traverseChild0) {
- /* neither child was intersected */
+ else if(traverse_mask == 0) {
+ /* Neither child was intersected. */
nodeAddr = traversalStack[stackPtr];
--stackPtr;
}
@@ -311,6 +289,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -368,6 +349,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
Psplat[2] = ssef(P.z);
tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ tfar = ssef(isect_t);
+# endif
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
# endif
@@ -410,3 +394,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
index 30ed851d861..5eda3213acb 100644
--- a/intern/cycles/kernel/geom/geom_qbvh.h
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@@ -51,23 +51,25 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
}
-ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
- const ssef& tnear,
- const ssef& tfar,
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
+ const ssef& tnear,
+ const ssef& tfar,
#ifdef __KERNEL_AVX2__
- const sse3f& org_idir,
+ const sse3f& org_idir,
#else
- const sse3f& org,
+ const sse3f& org,
#endif
- const sse3f& idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int nodeAddr,
- ssef *__restrict dist)
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int nodeAddr,
+ ssef *__restrict dist)
{
const int offset = nodeAddr + 1;
#ifdef __KERNEL_AVX2__
@@ -101,24 +103,25 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
return mask;
}
-ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
- const ssef& tnear,
- const ssef& tfar,
+ccl_device_inline int qbvh_aligned_node_intersect_robust(
+ KernelGlobals *__restrict kg,
+ const ssef& tnear,
+ const ssef& tfar,
#ifdef __KERNEL_AVX2__
- const sse3f& P_idir,
+ const sse3f& P_idir,
#else
- const sse3f& P,
+ const sse3f& P,
#endif
- const sse3f& idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int nodeAddr,
- const float difl,
- ssef *__restrict dist)
+ const sse3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int nodeAddr,
+ const float difl,
+ ssef *__restrict dist)
{
const int offset = nodeAddr + 1;
#ifdef __KERNEL_AVX2__
@@ -145,3 +148,286 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
*dist = tNear;
return (int)movemask(vmask);
}
+
+/* Unaligned nodes intersection */
+
+/* Intersect a ray with the child boxes of an unaligned QBVH node.
+ *
+ * Slots nodeAddr+1 .. nodeAddr+12 of __bvh_nodes are read as the nine
+ * coefficients of a 3x3 matrix followed by three translation terms, one ssef
+ * per coefficient. Ray origin and direction are transformed with it, and on
+ * every axis of the transformed space the ray parameter is evaluated at
+ * coordinate 0 and at coordinate 1:
+ *
+ *   t_at_zero = -P'/D',   t_at_one = t_at_zero + 1/D'
+ *
+ * The per-axis intervals are then combined with [tnear, tfar].
+ *
+ * The combined entry distances are written to dist. The return value is
+ * movemask() of the per-lane test "entry <= exit".
+ *
+ * org_idir, idir and the near/far offsets are not referenced by name in this
+ * body.
+ * NOTE(review): presumably they are kept so the argument list matches the
+ * aligned intersector, and each lane presumably corresponds to one child of
+ * the node - confirm both.
+ */
+ccl_device_inline int qbvh_unaligned_node_intersect(
+		KernelGlobals *__restrict kg,
+		const ssef& tnear,
+		const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+		const sse3f& org_idir,
+#endif
+		const sse3f& org,
+		const sse3f& dir,
+		const sse3f& idir,
+		const int near_x,
+		const int near_y,
+		const int near_z,
+		const int far_x,
+		const int far_y,
+		const int far_z,
+		const int nodeAddr,
+		ssef *__restrict dist)
+{
+	/* Linear part of the node transform. */
+	const ssef row_x_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
+	const ssef row_x_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
+	const ssef row_x_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);
+
+	const ssef row_y_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
+	const ssef row_y_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
+	const ssef row_y_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);
+
+	const ssef row_z_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
+	const ssef row_z_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
+	const ssef row_z_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);
+
+	/* Translation part of the node transform. */
+	const ssef shift_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
+	const ssef shift_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
+	const ssef shift_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);
+
+	/* Direction only gets the linear part applied. */
+	const ssef local_dir_x = dir.x*row_x_x + dir.y*row_x_y + dir.z*row_x_z;
+	const ssef local_dir_y = dir.x*row_y_x + dir.y*row_y_y + dir.z*row_y_z;
+	const ssef local_dir_z = dir.x*row_z_x + dir.y*row_z_y + dir.z*row_z_z;
+
+	/* Origin gets the linear part plus the translation. */
+	const ssef local_org_x = org.x*row_x_x + org.y*row_x_y + org.z*row_x_z + shift_x;
+	const ssef local_org_y = org.x*row_y_x + org.y*row_y_y + org.z*row_y_z + shift_y;
+	const ssef local_org_z = org.x*row_z_x + org.y*row_z_y + org.z*row_z_z + shift_z;
+
+	/* Negated reciprocal of the transformed direction. */
+	const ssef minus_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef neg_rcp_x = minus_one / local_dir_x;
+	const ssef neg_rcp_y = minus_one / local_dir_y;
+	const ssef neg_rcp_z = minus_one / local_dir_z;
+
+	/* Ray parameter where the transformed ray reaches coordinate 0... */
+	const ssef t_zero_x = local_org_x * neg_rcp_x;
+	const ssef t_zero_y = local_org_y * neg_rcp_y;
+	const ssef t_zero_z = local_org_z * neg_rcp_z;
+
+	/* ...and where it reaches coordinate 1. */
+	const ssef t_one_x = t_zero_x - neg_rcp_x;
+	const ssef t_one_y = t_zero_y - neg_rcp_y;
+	const ssef t_one_z = t_zero_z - neg_rcp_z;
+
+	/* Order the two crossings of every axis into entry and exit. */
+#ifdef __KERNEL_SSE41__
+	const ssef enter_x = mini(t_zero_x, t_one_x);
+	const ssef enter_y = mini(t_zero_y, t_one_y);
+	const ssef enter_z = mini(t_zero_z, t_one_z);
+	const ssef leave_x = maxi(t_zero_x, t_one_x);
+	const ssef leave_y = maxi(t_zero_y, t_one_y);
+	const ssef leave_z = maxi(t_zero_z, t_one_z);
+#else
+	const ssef enter_x = min(t_zero_x, t_one_x);
+	const ssef enter_y = min(t_zero_y, t_one_y);
+	const ssef enter_z = min(t_zero_z, t_one_z);
+	const ssef leave_x = max(t_zero_x, t_one_x);
+	const ssef leave_y = max(t_zero_y, t_one_y);
+	const ssef leave_z = max(t_zero_z, t_one_z);
+#endif
+
+	/* Combine the axes with the incoming [tnear, tfar] range. */
+	const ssef t_enter = max4(tnear, enter_x, enter_y, enter_z);
+	const ssef t_leave = min4(tfar, leave_x, leave_y, leave_z);
+	const sseb hit_mask = t_enter <= t_leave;
+	*dist = t_enter;
+	return movemask(hit_mask);
+}
+
+/* Variant of the unaligned QBVH node intersection with a relative tolerance.
+ *
+ * Slots nodeAddr+1 .. nodeAddr+12 of __bvh_nodes are read as the nine
+ * coefficients of a 3x3 matrix followed by three translation terms, one ssef
+ * per coefficient. P and dir are transformed with it, and on every axis of
+ * the transformed space the ray parameter is evaluated at coordinate 0 and
+ * at coordinate 1. The per-axis intervals are combined with [tnear, tfar].
+ *
+ * The acceptance test is relaxed by difl:
+ *
+ *   (1 - difl)*entry <= (1 + difl)*exit
+ *
+ * The unscaled entry distances are written to dist. The return value is
+ * movemask() of the per-lane test above.
+ *
+ * P_idir, idir and the near/far offsets are not referenced by name in this
+ * body.
+ * NOTE(review): presumably they are kept so the argument list matches the
+ * aligned robust intersector - confirm.
+ */
+ccl_device_inline int qbvh_unaligned_node_intersect_robust(
+		KernelGlobals *__restrict kg,
+		const ssef& tnear,
+		const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+		const sse3f& P_idir,
+#endif
+		const sse3f& P,
+		const sse3f& dir,
+		const sse3f& idir,
+		const int near_x,
+		const int near_y,
+		const int near_z,
+		const int far_x,
+		const int far_y,
+		const int far_z,
+		const int nodeAddr,
+		const float difl,
+		ssef *__restrict dist)
+{
+	/* Linear part of the node transform. */
+	const ssef row_x_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+1);
+	const ssef row_x_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+2);
+	const ssef row_x_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+3);
+
+	const ssef row_y_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+4);
+	const ssef row_y_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+5);
+	const ssef row_y_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+6);
+
+	const ssef row_z_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+7);
+	const ssef row_z_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+8);
+	const ssef row_z_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+9);
+
+	/* Translation part of the node transform. */
+	const ssef shift_x = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+10);
+	const ssef shift_y = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+11);
+	const ssef shift_z = kernel_tex_fetch_ssef(__bvh_nodes, nodeAddr+12);
+
+	/* Direction only gets the linear part applied. */
+	const ssef local_dir_x = dir.x*row_x_x + dir.y*row_x_y + dir.z*row_x_z;
+	const ssef local_dir_y = dir.x*row_y_x + dir.y*row_y_y + dir.z*row_y_z;
+	const ssef local_dir_z = dir.x*row_z_x + dir.y*row_z_y + dir.z*row_z_z;
+
+	/* Origin gets the linear part plus the translation. */
+	const ssef local_P_x = P.x*row_x_x + P.y*row_x_y + P.z*row_x_z + shift_x;
+	const ssef local_P_y = P.x*row_y_x + P.y*row_y_y + P.z*row_y_z + shift_y;
+	const ssef local_P_z = P.x*row_z_x + P.y*row_z_y + P.z*row_z_z + shift_z;
+
+	/* Negated reciprocal of the transformed direction. */
+	const ssef minus_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef neg_rcp_x = minus_one / local_dir_x;
+	const ssef neg_rcp_y = minus_one / local_dir_y;
+	const ssef neg_rcp_z = minus_one / local_dir_z;
+
+	/* Ray parameter where the transformed ray reaches coordinate 0... */
+	const ssef t_zero_x = local_P_x * neg_rcp_x;
+	const ssef t_zero_y = local_P_y * neg_rcp_y;
+	const ssef t_zero_z = local_P_z * neg_rcp_z;
+
+	/* ...and where it reaches coordinate 1. */
+	const ssef t_one_x = t_zero_x - neg_rcp_x;
+	const ssef t_one_y = t_zero_y - neg_rcp_y;
+	const ssef t_one_z = t_zero_z - neg_rcp_z;
+
+	/* Order the two crossings of every axis into entry and exit. */
+#ifdef __KERNEL_SSE41__
+	const ssef enter_x = mini(t_zero_x, t_one_x);
+	const ssef enter_y = mini(t_zero_y, t_one_y);
+	const ssef enter_z = mini(t_zero_z, t_one_z);
+	const ssef leave_x = maxi(t_zero_x, t_one_x);
+	const ssef leave_y = maxi(t_zero_y, t_one_y);
+	const ssef leave_z = maxi(t_zero_z, t_one_z);
+#else
+	const ssef enter_x = min(t_zero_x, t_one_x);
+	const ssef enter_y = min(t_zero_y, t_one_y);
+	const ssef enter_z = min(t_zero_z, t_one_z);
+	const ssef leave_x = max(t_zero_x, t_one_x);
+	const ssef leave_y = max(t_zero_y, t_one_y);
+	const ssef leave_z = max(t_zero_z, t_one_z);
+#endif
+
+	/* Combine the axes with the incoming [tnear, tfar] range. */
+	const ssef t_enter = max4(tnear, enter_x, enter_y, enter_z);
+	const ssef t_leave = min4(tfar, leave_x, leave_y, leave_z);
+
+	/* Scale factors that relax the comparison by difl on both sides. */
+	const float scale_enter = 1.0f - difl;
+	const float scale_leave = 1.0f + difl;
+	const sseb hit_mask = scale_enter*t_enter <= scale_leave*t_leave;
+
+	*dist = t_enter;
+	return movemask(hit_mask);
+}
+
+/* Intersector wrappers.
+ *
+ * They check the node type and call the appropriate intersection code.
+ */
+
+/* Pick the aligned or unaligned intersector for the node at nodeAddr.
+ *
+ * The float4 stored at slot nodeAddr of __bvh_nodes is fetched and the bit
+ * pattern of its x component is tested against PATH_RAY_NODE_UNALIGNED.
+ * When the flag is set the unaligned intersector is called, otherwise the
+ * aligned one. The result of whichever intersector ran is returned as is,
+ * and dist is filled in by it.
+ *
+ * With __KERNEL_AVX2__ defined the aligned intersector receives org_idir;
+ * without it the aligned intersector receives org.
+ */
+ccl_device_inline int qbvh_node_intersect(
+		KernelGlobals *__restrict kg,
+		const ssef& tnear,
+		const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+		const sse3f& org_idir,
+#endif
+		const sse3f& org,
+		const sse3f& dir,
+		const sse3f& idir,
+		const int near_x,
+		const int near_y,
+		const int near_z,
+		const int far_x,
+		const int far_y,
+		const int far_z,
+		const int nodeAddr,
+		ssef *__restrict dist)
+{
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned =
+		(__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+
+	/* Axis-aligned node. */
+	if(!is_unaligned) {
+		return qbvh_aligned_node_intersect(kg,
+		                                   tnear,
+		                                   tfar,
+#ifdef __KERNEL_AVX2__
+		                                   org_idir,
+#else
+		                                   org,
+#endif
+		                                   idir,
+		                                   near_x, near_y, near_z,
+		                                   far_x, far_y, far_z,
+		                                   nodeAddr,
+		                                   dist);
+	}
+
+	/* Node flagged as unaligned. */
+	return qbvh_unaligned_node_intersect(kg,
+	                                     tnear,
+	                                     tfar,
+#ifdef __KERNEL_AVX2__
+	                                     org_idir,
+#endif
+	                                     org,
+	                                     dir,
+	                                     idir,
+	                                     near_x, near_y, near_z,
+	                                     far_x, far_y, far_z,
+	                                     nodeAddr,
+	                                     dist);
+}
+
+/* Pick the aligned or unaligned robust intersector for the node at nodeAddr.
+ *
+ * The float4 stored at slot nodeAddr of __bvh_nodes is fetched and the bit
+ * pattern of its x component is tested against PATH_RAY_NODE_UNALIGNED.
+ * When the flag is set the unaligned robust intersector is called, otherwise
+ * the aligned robust one. difl is forwarded unchanged. The result of
+ * whichever intersector ran is returned as is, and dist is filled in by it.
+ *
+ * With __KERNEL_AVX2__ defined the aligned intersector receives P_idir;
+ * without it the aligned intersector receives P.
+ */
+ccl_device_inline int qbvh_node_intersect_robust(
+		KernelGlobals *__restrict kg,
+		const ssef& tnear,
+		const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+		const sse3f& P_idir,
+#endif
+		const sse3f& P,
+		const sse3f& dir,
+		const sse3f& idir,
+		const int near_x,
+		const int near_y,
+		const int near_z,
+		const int far_x,
+		const int far_y,
+		const int far_z,
+		const int nodeAddr,
+		const float difl,
+		ssef *__restrict dist)
+{
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned =
+		(__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+
+	/* Axis-aligned node. */
+	if(!is_unaligned) {
+		return qbvh_aligned_node_intersect_robust(kg,
+		                                          tnear,
+		                                          tfar,
+#ifdef __KERNEL_AVX2__
+		                                          P_idir,
+#else
+		                                          P,
+#endif
+		                                          idir,
+		                                          near_x, near_y, near_z,
+		                                          far_x, far_y, far_z,
+		                                          nodeAddr,
+		                                          difl,
+		                                          dist);
+	}
+
+	/* Node flagged as unaligned. */
+	return qbvh_unaligned_node_intersect_robust(kg,
+	                                            tnear,
+	                                            tfar,
+#ifdef __KERNEL_AVX2__
+	                                            P_idir,
+#endif
+	                                            P,
+	                                            dir,
+	                                            idir,
+	                                            near_x, near_y, near_z,
+	                                            far_x, far_y, far_z,
+	                                            nodeAddr,
+	                                            difl,
+	                                            dist);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
index 97a0ceb0687..e5e611a0d47 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -27,6 +27,12 @@
*
*/
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
@@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(tmax);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
- sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
- sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@@ -109,22 +119,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
- int traverseChild = qbvh_node_intersect(kg,
- tnear,
- tfar,
+ int traverseChild = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
#ifdef __KERNEL_AVX2__
- P_idir4,
-#else
- org,
+ P_idir4,
#endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- &dist);
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+# endif
+# if BVH_FEATURE(BVH_HAIR)
+ dir4,
+# endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ &dist);
if(traverseChild != 0) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ }
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@@ -340,13 +363,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@@ -394,13 +422,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(tmax);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@@ -412,3 +445,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return false;
}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
index 5d76ac4b1f1..4adaf9c8f3d 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -25,6 +25,12 @@
*
*/
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
SubsurfaceIntersection *ss_isect,
@@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
- sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
- sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@@ -108,22 +118,37 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
/* Traverse internal nodes. */
while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
ssef dist;
- int traverseChild = qbvh_node_intersect(kg,
- tnear,
- tfar,
+
+ int traverseChild = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
#ifdef __KERNEL_AVX2__
- P_idir4,
-#else
- org,
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
#endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- &dist);
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ &dist);
if(traverseChild != 0) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ }
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@@ -270,3 +295,5 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
} while(nodeAddr != ENTRYPOINT_SENTINEL);
} while(nodeAddr != ENTRYPOINT_SENTINEL);
}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
index 1588ae3605c..24bf85f46c8 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -28,6 +28,14 @@
*
*/
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
+#endif
+
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
@@ -81,13 +89,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
BVH_DEBUG_INIT();
ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
- sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@@ -132,41 +144,62 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
*
* Need to test if doing opposite would be any faster.
*/
- traverseChild = qbvh_node_intersect_robust(kg,
- tnear,
- tfar,
+ traverseChild = NODE_INTERSECT_ROBUST(kg,
+ tnear,
+ tfar,
# ifdef __KERNEL_AVX2__
- P_idir4,
-# else
- org,
+ P_idir4,
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
# endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- difl,
- &dist);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4,
+# endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ difl,
+ &dist);
}
else
#endif /* BVH_HAIR_MINIMUM_WIDTH */
{
- traverseChild = qbvh_node_intersect(kg,
- tnear,
- tfar,
+ traverseChild = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
#ifdef __KERNEL_AVX2__
- P_idir4,
-#else
- org,
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
#endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- &dist);
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ &dist);
}
if(traverseChild != 0) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ float4 cnodes;
+/* TODO(sergey): Investigate whether moving cnodes upwards
+ * gives a speedup (will be a different cache pattern but will
+ * avoid the extra check here).
+ */
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ }
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@@ -361,13 +394,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@@ -398,13 +436,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@@ -417,3 +460,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
index d66c6e2b1e5..da21ede9e12 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@@ -26,6 +26,12 @@
*
*/
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
@@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
isect->object = OBJECT_NONE;
ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
- sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
- sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@@ -104,22 +114,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
- int traverseChild = qbvh_node_intersect(kg,
- tnear,
- tfar,
+ int traverseChild = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
#ifdef __KERNEL_AVX2__
- P_idir4,
-#else
- org,
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
#endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- &dist);
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ &dist);
if(traverseChild != 0) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ }
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@@ -278,13 +301,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
++stackPtr;
@@ -319,13 +347,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
object = OBJECT_NONE;
@@ -337,3 +370,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
index 89950f17c64..8a31775fae3 100644
--- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
@@ -26,6 +26,12 @@
*
*/
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT qbvh_node_intersect
+#else
+# define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif
+
ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
@@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
#ifdef __KERNEL_AVX2__
float3 P_idir = P*idir;
- sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#else
- sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+ sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
#endif
/* Offsets to select the side that becomes the lower or upper bound. */
@@ -108,22 +118,35 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
#endif
ssef dist;
- int traverseChild = qbvh_node_intersect(kg,
- tnear,
- tfar,
+ int traverseChild = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
#ifdef __KERNEL_AVX2__
- P_idir4,
-#else
- org,
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
#endif
- idir4,
- near_x, near_y, near_z,
- far_x, far_y, far_z,
- nodeAddr,
- &dist);
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ nodeAddr,
+ &dist);
if(traverseChild != 0) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+ }
/* One child is hit, continue with that child. */
int r = __bscf(traverseChild);
@@ -330,12 +353,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
num_hits_in_instance = 0;
isect_array->t = isect_t;
@@ -389,13 +417,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
tfar = ssef(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+# endif
idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
# ifdef __KERNEL_AVX2__
P_idir = P*idir;
P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# else
- org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+# endif
+
triangle_intersect_precalc(dir, &isect_precalc);
isect_t = tmax;
isect_array->t = isect_t;
@@ -409,3 +442,5 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
return num_hits;
}
+
+#undef NODE_INTERSECT