diff options
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh.h | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_nodes.h | 659 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_shadow.h | 132 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_subsurface.h | 118 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_traversal.h | 207 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_volume.h | 123 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_volume_all.h | 123 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh.h | 344 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_shadow.h | 73 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_subsurface.h | 57 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_traversal.h | 106 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_volume.h | 73 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_volume_all.h | 73 |
14 files changed, 1574 insertions, 533 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index f0adbc03e22..3c2f7747f34 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -141,6 +141,7 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_bvh.h + geom/geom_bvh_nodes.h geom/geom_bvh_shadow.h geom/geom_bvh_subsurface.h geom/geom_bvh_traversal.h diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index d0eedd3396a..f8d563f0afa 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -77,6 +77,8 @@ CCL_NAMESPACE_BEGIN /* Regular BVH traversal */ +#include "geom_bvh_nodes.h" + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 #include "geom_bvh_traversal.h" @@ -109,13 +111,13 @@ CCL_NAMESPACE_BEGIN #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface -# define BVH_FUNCTION_FEATURES 0 +# define BVH_FUNCTION_FEATURES BVH_HAIR # include "geom_bvh_subsurface.h" #endif #if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion -# define BVH_FUNCTION_FEATURES BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR # include "geom_bvh_subsurface.h" #endif @@ -123,19 +125,19 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume -# define BVH_FUNCTION_FEATURES 0 +# define BVH_FUNCTION_FEATURES BVH_HAIR # include "geom_bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR # include "geom_bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR # include "geom_bvh_volume.h" #endif @@ -175,19 +177,19 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all -# define BVH_FUNCTION_FEATURES 0 +# define BVH_FUNCTION_FEATURES BVH_HAIR # include "geom_bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR # include "geom_bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR # include "geom_bvh_volume_all.h" #endif diff --git a/intern/cycles/kernel/geom/geom_bvh_nodes.h b/intern/cycles/kernel/geom/geom_bvh_nodes.h new file mode 100644 index 00000000000..deb91ec95f5 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_nodes.h @@ -0,0 +1,659 @@ +/* + * Copyright 2011-2016, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and +// 3-vector which might be faster. +ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, + int nodeAddr, + int child) +{ + Transform space; + const int child_addr = nodeAddr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); + space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return space; +} + +#if !defined(__KERNEL_SSE2__) +ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const int nodeAddr, + const uint visibility, + float *dist) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float *dist) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int nodeAddr, + int child, + float *dist) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x); + const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y); + const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z); + const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x); + const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y); + const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z); + const float tNear = max4(0.0f, tNearX, tNearY, tNearZ); + const float tFar = min4(t, tFarX, tFarY, tFarZ); + *dist = tNear; + return tNear <= tFar; +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + const float /*extmax*/, + int nodeAddr, + int child, + float *dist) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, nodeAddr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x); + const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y); + const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z); + const float tFarX = max(tLowerXYZ.x, tUpperXYZ.x); + const float tFarY = max(tLowerXYZ.y, tUpperXYZ.y); + const float tFarZ = max(tLowerXYZ.z, tUpperXYZ.z); + const float tNear = max4(0.0f, tNearX, tNearY, tNearZ); + const float tFar = min4(t, tFarX, tFarY, tFarZ); + *dist = tNear; + if(difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down*tNear <= round_up*tFar; + } + else { + return tNear <= tFar; + } +} + +ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int nodeAddr, + const uint visibility, + float *dist) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float *dist) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, extmax, nodeAddr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + idir, + t, + nodeAddr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + idir, + t, + nodeAddr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + idir, + t, + difl, + extmax, + nodeAddr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + idir, + t, + difl, + extmax, + nodeAddr, + visibility, + dist); + } +} +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_inline bvh_aligned_node_intersect( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_aligned_node_intersect_robust( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if(difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 *tminmaxview = (float4*)&tminmax; + float& c0min = tminmaxview->x, &c1min = tminmaxview->y; + float& c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& tnear, + const ssef& tfar, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef tLowerX = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + tLowerY = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + tLowerZ = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(tLowerX, tUpperX); + ssef tnear_y = min(tLowerY, tUpperY); + ssef tnear_z = min(tLowerZ, tUpperZ); + ssef tfar_x = max(tLowerX, tUpperX); + ssef tfar_y = max(tLowerY, tUpperY); + ssef tfar_z = max(tLowerZ, tUpperZ); + + const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); + const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); + sseb vmask = tNear <= tFar; + dist[0] = tNear.f[0]; + dist[1] = tNear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& tnear, + const ssef& tfar, + const float difl, + const float /*extmax*/, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef tLowerX = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + tLowerY = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + tLowerZ = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(tLowerX, tUpperX); + ssef tnear_y = min(tLowerY, tUpperY); + ssef tnear_z = min(tLowerZ, tUpperZ); + ssef tfar_x = max(tLowerX, tUpperX); + ssef tfar_y = max(tLowerY, tUpperY); + ssef tfar_z = max(tLowerZ, tUpperZ); + + const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); + const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); + sseb vmask; + if(difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down*tNear <= round_up*tFar; + } + else { + vmask = tNear <= tFar; + } + + dist[0] = tNear.f[0]; + dist[1] = tNear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tnear, + const ssef& tfar, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + tnear, + tfar, + nodeAddr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tnear, + const ssef& tfar, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + tnear, + tfar, + difl, + extmax, + nodeAddr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + nodeAddr, + visibility, + dist); + } +} +#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h index 60cc50e8bfd..a54c6024152 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h @@ -21,6 +21,12 @@ # include "geom_qbvh_shadow.h" #endif +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +#endif + /* This is a template BVH traversal function, where various features can be * enabled/disabled. This way we can compile optimized versions for each case * without new features slowing things down. @@ -41,7 +47,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ int traversalStack[BVH_STACK_SIZE]; traversalStack[0] = ENTRYPOINT_SENTINEL; @@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -94,86 +103,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { /* traverse internal nodes */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + int nodeAddrChild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, # endif - + idir, + isect_t, + nodeAddr, + PATH_RAY_SHADOW, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, # endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + PATH_RAY_SHADOW, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + nodeAddr = __float_as_int(cnodes.z); + nodeAddrChild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool closestChild1 = (dist[1] < dist[0]); if(closestChild1) { int tmp = nodeAddr; @@ -186,12 +153,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = nodeAddrChild1; } else { - /* one child was intersected */ - if(traverseChild1) { + /* One child was intersected. */ + if(traverse_mask == 2) { nodeAddr = nodeAddrChild1; } - else if(!traverseChild0) { - /* neither child was intersected */ + else if(traverse_mask == 0) { + /* Neither child was intersected. */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } @@ -238,7 +205,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); else hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); @@ -317,6 +284,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -369,6 +339,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -410,3 +383,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index e43eedad7bc..88aaf01d682 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -21,6 +21,12 @@ # include "geom_qbvh_subsurface.h" #endif +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +#endif + /* This is a template BVH traversal function for subsurface scattering, where * various features can be enabled/disabled. This way we can compile optimized * versions for each case without new features slowing things down. @@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,79 +109,47 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* traversal loop */ do { - do - { + do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + int nodeAddrChild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + nodeAddr, + PATH_RAY_ALL_VISIBILITY, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + PATH_RAY_ALL_VISIBILITY, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + nodeAddr = __float_as_int(cnodes.z); + nodeAddrChild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool closestChild1 = (dist[1] < dist[0]); if(closestChild1) { int tmp = nodeAddr; @@ -185,12 +162,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = nodeAddrChild1; } else { - /* one child was intersected */ - if(traverseChild1) { + /* One child was intersected. */ + if(traverse_mask == 2) { nodeAddr = nodeAddrChild1; } - else if(!traverseChild0) { - /* neither child was intersected */ + else if(traverse_mask == 0) { + /* Neither child was intersected. */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } @@ -286,3 +263,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index 5b9c7b46f82..f409dd5f403 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -21,6 +21,14 @@ # include "geom_qbvh_traversal.h" #endif +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust +#endif + /* This is a template BVH traversal function, where various features can be * enabled/disabled. This way we can compile optimized versions for each case * without new features slowing things down. @@ -49,7 +57,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ int traversalStack[BVH_STACK_SIZE]; traversalStack[0] = ENTRYPOINT_SENTINEL; @@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -101,121 +112,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { /* traverse internal nodes */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + int nodeAddrChild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + difl, + extmax, + nodeAddr, + visibility, + dist); } + else # endif - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -# endif - + { + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + nodeAddr, + visibility, + dist); + } #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float4 *tminmaxview = (float4*)&tminmax; - float &c0min = tminmaxview->x, &c1min = tminmaxview->y; - float &c0max = tminmaxview->z, &c1max = tminmaxview->w; - - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + nodeAddr, + visibility, + dist); } + else # endif - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); -# endif + { + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + visibility, + dist); + } #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + nodeAddr = __float_as_int(cnodes.z); + nodeAddrChild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool closestChild1 = (dist[1] < dist[0]); if(closestChild1) { int tmp = nodeAddr; @@ -228,12 +204,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = nodeAddrChild1; } else { - /* one child was intersected */ - if(traverseChild1) { + /* One child was intersected. */ + if(traverse_mask == 2) { nodeAddr = nodeAddrChild1; } - else if(!traverseChild0) { - /* neither child was intersected */ + else if(traverse_mask == 0) { + /* Neither child was intersected. */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } @@ -268,6 +244,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif #else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -287,6 +266,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -313,6 +295,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -342,6 +327,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -376,6 +364,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -433,3 +424,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h index 36033ecf459..5e70ce99f51 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/geom/geom_bvh_volume.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume.h" +# include "geom_qbvh_volume.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -91,75 +100,44 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { /* traverse internal nodes */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + int nodeAddrChild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + nodeAddr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + nodeAddr = __float_as_int(cnodes.z); + nodeAddrChild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool closestChild1 = (dist[1] < dist[0]); if(closestChild1) { int tmp = nodeAddr; @@ -172,12 +150,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = nodeAddrChild1; } else { - /* one child was intersected */ - if(traverseChild1) { + /* One child was intersected. */ + if(traverse_mask == 2) { nodeAddr = nodeAddrChild1; } - else if(!traverseChild0) { - /* neither child was intersected */ + else if(traverse_mask == 0) { + /* Neither child was intersected. */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } @@ -258,6 +236,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -298,6 +279,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -337,3 +321,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h index f9536148933..ab5ac8505a3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h +++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume_all.h" +# include "geom_qbvh_volume_all.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -95,75 +104,44 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { /* traverse internal nodes */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + int nodeAddrChild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_array->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + nodeAddr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + nodeAddr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + nodeAddr = __float_as_int(cnodes.z); + nodeAddrChild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool closestChild1 = (dist[1] < dist[0]); if(closestChild1) { int tmp = nodeAddr; @@ -176,12 +154,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, traversalStack[stackPtr] = nodeAddrChild1; } else { - /* one child was intersected */ - if(traverseChild1) { + /* One child was intersected. */ + if(traverse_mask == 2) { nodeAddr = nodeAddrChild1; } - else if(!traverseChild0) { - /* neither child was intersected */ + else if(traverse_mask == 0) { + /* Neither child was intersected. */ nodeAddr = traversalStack[stackPtr]; --stackPtr; } @@ -311,6 +289,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -368,6 +349,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif @@ -410,3 +394,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h index 30ed851d861..5eda3213acb 100644 --- a/intern/cycles/kernel/geom/geom_qbvh.h +++ b/intern/cycles/kernel/geom/geom_qbvh.h @@ -51,23 +51,25 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } } -ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, +/* Axis-aligned nodes intersection */ + +ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, #ifdef __KERNEL_AVX2__ - const sse3f& org_idir, + const sse3f& org_idir, #else - const sse3f& org, + const sse3f& org, #endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - ssef *__restrict dist) + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + ssef *__restrict dist) { const int offset = nodeAddr + 1; #ifdef __KERNEL_AVX2__ @@ -101,24 +103,25 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, return mask; } -ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, +ccl_device_inline int qbvh_aligned_node_intersect_robust( + KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, #ifdef __KERNEL_AVX2__ - const sse3f& P_idir, + const sse3f& P_idir, #else - const sse3f& P, + const sse3f& P, #endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - const float difl, - ssef *__restrict dist) + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + const float difl, + ssef *__restrict dist) { const int offset = nodeAddr + 1; #ifdef __KERNEL_AVX2__ @@ -145,3 +148,286 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, *dist = tNear; return (int)movemask(vmask); } + +/* Unaligned nodes intersection */ + +ccl_device_inline int qbvh_unaligned_node_intersect( + KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + ssef *__restrict dist) +{ + const int offset = nodeAddr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z); + const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z); + const sseb vmask = tNear <= tFar; + *dist = tNear; + return movemask(vmask); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z); + const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z); + const sseb vmask = tNear <= tFar; + *dist = tNear; + return movemask(vmask); +#endif +} + +ccl_device_inline int qbvh_unaligned_node_intersect_robust( + KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + const float difl, + ssef *__restrict dist) +{ + const int offset = nodeAddr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); +#endif + const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z); + const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down*tNear <= round_up*tFar; + *dist = tNear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. + */ + +ccl_device_inline int qbvh_node_intersect( + KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + ssef *__restrict dist) +{ + const int offset = nodeAddr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + dist); + } + else { + return qbvh_aligned_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + dist); + } +} + +ccl_device_inline int qbvh_node_intersect_robust( + KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + const float difl, + ssef *__restrict dist) +{ + const int offset = nodeAddr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect_robust(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + difl, + dist); + } + else { + return qbvh_aligned_node_intersect_robust(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h index 97a0ceb0687..e5e611a0d47 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h @@ -27,6 +27,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(tmax); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -109,22 +119,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int traverseChild = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + } /* One child is hit, continue with that child. */ int r = __bscf(traverseChild); @@ -340,13 +363,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); ++stackPtr; @@ -394,13 +422,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(tmax); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; @@ -412,3 +445,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return false; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h index 5d76ac4b1f1..4adaf9c8f3d 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h @@ -25,6 +25,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, SubsurfaceIntersection *ss_isect, @@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -108,22 +118,37 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Traverse internal nodes. */ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + + int traverseChild = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + } /* One child is hit, continue with that child. */ int r = __bscf(traverseChild); @@ -270,3 +295,5 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } while(nodeAddr != ENTRYPOINT_SENTINEL); } while(nodeAddr != ENTRYPOINT_SENTINEL); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h index 1588ae3605c..24bf85f46c8 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h @@ -28,6 +28,14 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect, @@ -81,13 +89,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, BVH_DEBUG_INIT(); ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -132,41 +144,62 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, * * Need to test if doing opposite would be any faster. */ - traverseChild = qbvh_node_intersect_robust(kg, - tnear, - tfar, + traverseChild = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, # ifdef __KERNEL_AVX2__ - P_idir4, -# else - org, + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, # endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - difl, - &dist); +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + difl, + &dist); } else #endif /* BVH_HAIR_MINIMUM_WIDTH */ { - traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + traverseChild = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); } if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + float4 cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + } /* One child is hit, continue with that child. */ int r = __bscf(traverseChild); @@ -361,13 +394,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); ++stackPtr; @@ -398,13 +436,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; @@ -417,3 +460,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return (isect->prim != PRIM_NONE); } + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h index d66c6e2b1e5..da21ede9e12 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume.h +++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect, @@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect->object = OBJECT_NONE; ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -104,22 +114,35 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int traverseChild = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + } /* One child is hit, continue with that child. */ int r = __bscf(traverseChild); @@ -278,13 +301,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); ++stackPtr; @@ -319,13 +347,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; @@ -337,3 +370,5 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return (isect->prim != PRIM_NONE); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h index 89950f17c64..8a31775fae3 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h +++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -108,22 +118,35 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int traverseChild = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7); + } /* One child is hit, continue with that child. */ int r = __bscf(traverseChild); @@ -330,12 +353,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -389,13 +417,18 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); isect_t = tmax; isect_array->t = isect_t; @@ -409,3 +442,5 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return num_hits; } + +#undef NODE_INTERSECT |