diff options
Diffstat (limited to 'intern/cycles/kernel')
35 files changed, 3028 insertions, 1732 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index f0adbc03e22..bd3969b2889 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -28,6 +28,22 @@ set(SRC kernels/cuda/kernel.cu ) +set(SRC_BVH_HEADERS + bvh/bvh.h + bvh/bvh_nodes.h + bvh/bvh_shadow_all.h + bvh/bvh_subsurface.h + bvh/bvh_traversal.h + bvh/bvh_volume.h + bvh/bvh_volume_all.h + bvh/qbvh_nodes.h + bvh/qbvh_shadow_all.h + bvh/qbvh_subsurface.h + bvh/qbvh_traversal.h + bvh/qbvh_volume.h + bvh/qbvh_volume_all.h +) + set(SRC_HEADERS kernel_accumulate.h kernel_bake.h @@ -140,23 +156,11 @@ set(SRC_SVM_HEADERS set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h - geom/geom_bvh.h - geom/geom_bvh_shadow.h - geom/geom_bvh_subsurface.h - geom/geom_bvh_traversal.h - geom/geom_bvh_volume.h - geom/geom_bvh_volume_all.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_object.h geom/geom_primitive.h - geom/geom_qbvh.h - geom/geom_qbvh_shadow.h - geom/geom_qbvh_subsurface.h - geom/geom_qbvh_traversal.h - geom/geom_qbvh_volume.h - geom/geom_qbvh_volume_all.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h @@ -212,7 +216,14 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) + set(cuda_sources kernels/cuda/kernel.cu + ${SRC_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch experimental) @@ -312,6 +323,7 @@ add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -346,6 +358,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/bvh/bvh.h index d0eedd3396a..59881738195 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -35,6 +35,13 @@ CCL_NAMESPACE_BEGIN # define ccl_device_intersect ccl_device_inline #endif +/* bottom-most stack entry, indicating the end of traversal */ +#define ENTRYPOINT_SENTINEL 0x76543210 + +/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ +#define BVH_STACK_SIZE 192 +#define BVH_QSTACK_SIZE 384 + /* BVH intersection function variations */ #define BVH_INSTANCING 1 @@ -72,71 +79,73 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. 
*/ #ifdef __QBVH__ -# include "geom_qbvh.h" +# include "qbvh_nodes.h" #endif /* Regular BVH traversal */ +#include "bvh_nodes.h" + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "geom_bvh_traversal.h" +#include "bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_subsurface.h" #endif #if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion -# define BVH_FUNCTION_FEATURES BVH_MOTION -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR +# include "bvh_subsurface.h" #endif /* Volume BVH traversal */ #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__INSTANCING__) # define 
BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume.h" #endif /* Record all intersections - Shadow BVH traversal */ @@ -144,51 +153,51 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif /* Record all intersections - Volume BVH traversal */ #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all -# define BVH_FUNCTION_FEATURES 0 -# include 
"geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume_all.h" #endif #undef BVH_FEATURE diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h new file mode 100644 index 00000000000..db2275b0ff8 --- /dev/null +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -0,0 +1,656 @@ +/* + * Copyright 2011-2016, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and +// 3-vector which might be faster. 
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, + int node_addr, + int child) +{ + Transform space; + const int child_addr = node_addr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); + space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return space; +} + +#if !defined(__KERNEL_SSE2__) +ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + 
extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 lower_xyz = aligned_P * nrdir; + float3 upper_xyz = lower_xyz - nrdir; + const float near_x = min(lower_xyz.x, upper_xyz.x); + const float near_y = min(lower_xyz.y, upper_xyz.y); + const float near_z = min(lower_xyz.z, upper_xyz.z); + const float far_x = max(lower_xyz.x, upper_xyz.x); + const float far_y = max(lower_xyz.y, upper_xyz.y); + const float far_z = max(lower_xyz.z, upper_xyz.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + return tnear <= tfar; +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); + const float near_y = min(tLowerXYZ.y, 
tUpperXYZ.y); + const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); + const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); + const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); + const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + if(difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down*tnear <= round_up*tfar; + } + else { + return tnear <= tfar; + } +} + +ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & 
visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + idir, + t, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + idir, + t, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_inline bvh_aligned_node_intersect( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], 
shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_aligned_node_intersect_robust( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if(difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 *tminmaxview = (float4*)&tminmax; + 
float& c0min = tminmaxview->x, &c1min = tminmaxview->y; + float& c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + 
upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask = tnear <= tfar; + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 
0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask; + if(difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down*tnear <= round_up*tfar; + } + else { + vmask = tnear <= tfar; + } + + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 
2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + isect_near, + isect_far, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + isect_near, + isect_far, + difl, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 4005489f77d..1869457f0c3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_shadow.h" +# include "qbvh_shadow_all.h" +#endif + +#if 
BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function, where various features can be @@ -41,14 +47,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -93,130 +102,87 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_ahild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, # endif - + idir, + isect_t, + node_addr, + PATH_RAY_SHADOW, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, # endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_SHADOW, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -226,22 +192,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, 
PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -253,6 +254,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* shadow ray early termination */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. */ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -283,23 +287,20 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - - isect_array->t = isect_t; + /* Move on to next entry in intersections array */ + isect_array++; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -317,21 +318,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = 
kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -369,15 +373,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } @@ -410,3 +417,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 915e9415c93..18978efcfa3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_subsurface.h" +# include "qbvh_subsurface.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for subsurface scattering, where @@ -44,12 +50,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal 
variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* ray parameters in registers */ float3 P = ray->P; @@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,127 +109,94 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* traversal loop */ do { - do - { + do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - 
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + 
tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -230,15 +206,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -251,8 +227,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, @@ 
-286,3 +262,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index ae919ef3f86..68a11b65ad7 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,15 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_traversal.h" +# include "qbvh_traversal.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust #endif /* This is a template BVH traversal function, where various features can be @@ -49,14 +57,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,174 +111,148 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse 
internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - # if 
BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -# endif - + { + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); + } #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float4 *tminmaxview = (float4*)&tminmax; - float 
&c0min = tminmaxview->x, &c1min = tminmaxview->y; - float &c0max = tminmaxview->z, &c1max = tminmaxview->w; - - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); -# endif + { + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. 
*/ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } BVH_DEBUG_NEXT_STEP(); } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + 
visibility, + object, + prim_addr)) + { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif #else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -278,15 +263,26 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -299,20 +295,47 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + if(kernel_data.curve.curveflags & 
CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -327,7 +350,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); @@ -342,24 +365,27 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); BVH_DEBUG_NEXT_INSTANCE(); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -376,16 +402,19 @@ ccl_device bool 
BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -433,3 +462,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index f3edf85d723..03499e94347 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume.h" +# include "qbvh_volume.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -43,12 +49,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t 
shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -90,143 +99,124 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) 
* idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool 
closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr); } break; } @@ -239,7 +229,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -258,29 +248,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = 
ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -298,16 +291,19 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -337,3 +333,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index ec837212471..b5405e8e57b 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume_all.h" +# include "qbvh_volume_all.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -44,12 +50,12 @@ 
ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -94,137 +103,109 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_array->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - 
NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, 
tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); bool hit; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -239,6 +220,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -246,23 +230,28 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -277,6 +266,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -290,7 +282,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -311,29 +303,32 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -368,16 +363,19 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, 
-isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } @@ -410,3 +408,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h new file mode 100644 index 00000000000..4d8695bedec --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -0,0 +1,433 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +struct QBVHStackItem { + int addr; + float dist; +}; + +/* TOOD(sergey): Investigate if using intrinsics helps for both + * stack item swap and float comparison. 
+ */ +ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, + QBVHStackItem *ccl_restrict b) +{ + QBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3, + QBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#else + const sse3f& org, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, 
offset+far_z), idir.z, org_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; +#endif + +#ifdef __KERNEL_SSE41__ + const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); + const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); + const sseb vmask = cast(tnear) > cast(tfar); + int mask = (int)movemask(vmask)^0xf; +#else + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); +#endif + *dist = tnear; + return mask; +} + +ccl_device_inline int qbvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#else + const sse3f& P, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, 
P_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; +#endif + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return (int)movemask(vmask); +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int qbvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const 
ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y 
= max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#endif +} + +ccl_device_inline int qbvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + 
P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); +#endif + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
+ */ + +ccl_device_inline int qbvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return qbvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int qbvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return qbvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + 
far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index edb5b5c78c3..34753ff067d 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -27,6 +27,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(tmax); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,53 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +161,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,86 +186,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } #endif - int primAddr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. 
*/ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -241,22 +275,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -268,6 +337,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Shadow ray early termination. */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. 
*/ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -298,23 +370,21 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - + /* Move on to next entry in intersections array */ + isect_array++; isect_array->t = isect_t; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -329,28 +399,33 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != 
ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -383,21 +458,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(tmax); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index 84512a8783c..03794e3a882 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -25,6 +25,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, SubsurfaceIntersection *ss_isect, @@ -41,12 +47,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* Ray parameters in registers. */ float3 P = ray->P; @@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -106,29 +116,43 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -137,24 +161,24 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -162,82 +186,82 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* Intersect ray against primitive, */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -247,15 +271,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* Intersect ray against primitive. */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -267,6 +291,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, break; } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h new file mode 100644 index 00000000000..f82ff661495 --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -0,0 +1,505 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + + ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + + if(UNLIKELY(node_dist > isect->t) +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0) +#endif + { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + ssef dist; + + BVH_DEBUG_NEXT_STEP(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + float4 cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. 
*/ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
*/ +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index ab2e530dd20..b4f334eb842 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect, @@ -38,12 +44,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ float3 P = ray->P; @@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect->object = OBJECT_NONE; ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -92,29 +102,52 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -123,24 +156,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -148,102 +181,102 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); } break; } @@ -253,7 +286,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -268,34 +301,39 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -309,21 +347,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 5546471b0e3..a877e5bb341 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. 
*/ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,52 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +160,24 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,96 +185,94 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; bool hit; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -256,30 +287,31 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -294,6 +326,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -304,7 +339,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -320,35 +355,40 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -379,23 +419,30 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); isect_t = tmax; isect_array->t = isect_t; object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index c94a5384d1f..d2c7edb11ea 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -15,27 +15,6 @@ * limitations under the License. 
*/ -/* bottom-most stack entry, indicating the end of traversal */ -#define ENTRYPOINT_SENTINEL 0x76543210 - -/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ -#define BVH_STACK_SIZE 192 -#define BVH_QSTACK_SIZE 384 -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 7 -#define BVH_QNODE_LEAF_SIZE 1 -#define TRI_NODE_SIZE 3 - -/* silly workaround for float extended precision that happens when compiling - * without sse support on x86, it results in different results for float ops - * that you would otherwise expect to compare correctly */ -#if !defined(__i386__) || defined(__SSE__) -# define NO_EXTENDED_PRECISION -#else -# define NO_EXTENDED_PRECISION volatile -#endif - #include "geom_attribute.h" #include "geom_object.h" #include "geom_triangle.h" @@ -45,5 +24,4 @@ #include "geom_curve.h" #include "geom_volume.h" #include "geom_primitive.h" -#include "geom_bvh.h" diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8894843997c..292e1bfca0e 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -450,8 +450,8 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect else if(level == 1) { /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ float t = isect->t; float u = 0.0f; float gd = 0.0f; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index ffe55529110..2fb8e219884 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -47,13 +47,13 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint return (attr_map.y == ATTR_ELEMENT_NONE) ? 
(int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) +ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) { if(step == numsteps) { /* center step: regular vertex location */ - verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + verts[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + verts[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } else { /* center step not store in this array */ @@ -62,19 +62,19 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 offset += step*numverts; - verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } -ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) +ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4 
tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) { if(step == numsteps) { /* center step: regular vertex location */ - normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); } else { /* center step not stored in this array */ @@ -83,9 +83,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float offset += step*numverts; - normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } @@ -107,7 +107,7 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i /* fetch vertex coordinates */ float3 next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); @@ -259,7 +259,7 @@ ccl_device_noinline void 
motion_triangle_shader_setup(KernelGlobals *kg, ShaderD /* fetch vertex coordinates */ float3 verts[3], next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim))); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h deleted file mode 100644 index 2a2d7822eee..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -struct QBVHStackItem { - int addr; - float dist; -}; - -/* TOOD(sergey): Investigate if using intrinsics helps for both - * stack item swap and float comparison. 
- */ -ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a, - QBVHStackItem *__restrict b) -{ - QBVHStackItem tmp = *a; - *a = *b; - *b = tmp; -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3, - QBVHStackItem *__restrict s4) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } -} - -ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& org_idir, -#else - const sse3f& org, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); -#else - const ssef tnear_x = 
(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; -#endif - -#ifdef __KERNEL_SSE41__ - const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear)); - const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar)); - const sseb vmask = cast(tNear) > cast(tFar); - int mask = (int)movemask(vmask)^0xf; -#else - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = tNear <= tFar; - int mask = (int)movemask(vmask); -#endif - *dist = tNear; - return mask; -} - -ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& P_idir, -#else - const sse3f& P, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - const float difl, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); - 
const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); -#else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; -#endif - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = round_down*tNear <= round_up*tFar; - *dist = tNear; - return (int)movemask(vmask); -} diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h deleted file mode 100644 index 738d08ac6fc..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation, - * and code copyright 2009-2012 Intel Corporation - * - * Modifications Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width - * BVH_MOTION: motion blur rendering - * - */ - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - ,uint *lcg_state, - float difl, - float extmax -#endif - ) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; - traversalStack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; - float nodeDist = -FLT_MAX; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - BVH_DEBUG_INIT(); - - ssef tnear(0.0f), tfar(ray->t); - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. 
*/ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - if(UNLIKELY(nodeDist > isect->t)) { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int traverseChild; - ssef dist; - - BVH_DEBUG_NEXT_STEP(); - -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - /* NOTE: We extend all the child BB instead of fetching - * and checking visibility flags for each of the, - * - * Need to test if doing opposite would be any faster. - */ - traverseChild = qbvh_node_intersect_robust(kg, - tnear, - tfar, -# ifdef __KERNEL_AVX2__ - P_idir4, -# else - org, -# endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - difl, - &dist); - } - else -#endif /* BVH_HAIR_MINIMUM_WIDTH */ - { - traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, -#endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - } - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); - - /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - float d0 = ((float*)&dist)[r]; - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); - nodeDist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. 
- */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(traverseChild); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { - if(d1 < d0) { - nodeAddr = c1; - nodeDist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - continue; - } - else { - nodeAddr = c0; - nodeDist = d0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(traverseChild); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. 
- */ - r = __bscf(traverseChild); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); - } - - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } - - /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - -#ifdef __VISIBILITY_FLAG__ - if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) -#else - if(UNLIKELY((nodeDist > isect->t))) -#endif - { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int primAddr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { -#endif - int primAddr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - if(hit) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); - -# if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_itfm); -# else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; - traversalStack[stackPtr].dist = -FLT_MAX; - - nodeAddr = kernel_tex_fetch(__object_node, object); - - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. 
*/ -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 995dfac5b09..0c2351e1d1b 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -27,12 +27,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = 
float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - /* return normal */ if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) return normalize(cross(v2 - v0, v1 - v0)); @@ -44,11 +43,10 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute point */ float t = 1.0f - u - v; @@ -71,11 +69,10 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + P[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + P[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } /* Interpolate 
smooth vertex normal from vertices */ @@ -83,11 +80,10 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); } @@ -97,11 +93,10 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) { /* fetch triangle vertex coordinates */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + const float3 p0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 p1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 p2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute derivatives of P w.r.t. 
uv */ *dPdu = (p0 - p2); @@ -119,11 +114,11 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); - float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); - float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z)); + float f0 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.x); + float f1 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.y); + float f2 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; @@ -162,11 +157,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + 
tri_vindex.y)); + float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index b6dfc769012..fc081bda525 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -106,9 +106,10 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. */ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -202,9 +203,10 @@ ccl_device_inline void triangle_intersect_subsurface( const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. 
*/ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -324,9 +326,10 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D*t; - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); @@ -381,9 +384,10 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, P = P + D*t; #ifdef __INTERSECTION_REFINE__ - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 
tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 42314756f02..08f6f457805 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -42,6 +42,7 @@ #define ccl_constant #define ccl_may_alias #define ccl_addr_space +#define ccl_restrict __restrict__ /* No assert supported for CUDA */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index a5708448e23..8505cb85576 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_global __global #define ccl_local __local #define ccl_private __private +#define ccl_restrict restrict #ifdef __SPLIT_KERNEL__ # define ccl_addr_space __global diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 736a884f819..93c4bd3f7d5 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -51,8 +51,8 @@ ccl_device float area_light_sample(float3 P, bool sample_coord) { /* In our name system we're using P for the center, - * which is o in the paper. - */ + * which is o in the paper. 
+ */ float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; float axisu_len, axisv_len; diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 3c3503eab8b..d5b31037723 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -25,6 +25,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 94598e2565e..731dc0407c5 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -309,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, const ccl_addr_space PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) { return lcg_init(*rng + state->rng_offset + state->sample*scramble); } diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 245d236ff97..5ba262c1044 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -25,7 +25,8 @@ /* bvh */ KERNEL_TEX(float4, texture_float4, __bvh_nodes) KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __tri_storage) +KERNEL_TEX(float4, texture_float4, __prim_tri_verts) +KERNEL_TEX(uint, texture_uint, __prim_tri_index) KERNEL_TEX(uint, texture_uint, __prim_type) KERNEL_TEX(uint, texture_uint, __prim_visibility) KERNEL_TEX(uint, texture_uint, __prim_index) @@ -39,8 +40,7 @@ KERNEL_TEX(float4, texture_float4, __objects_vector) /* triangles */ KERNEL_TEX(uint, texture_uint, __tri_shader) KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(float4, texture_float4, __tri_vindex) -KERNEL_TEX(float4, texture_float4, __tri_verts) 
+KERNEL_TEX(uint4, texture_uint4, __tri_vindex) /* curves */ KERNEL_TEX(float4, texture_float4, __curves) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 76d2a6b98e6..5de58ba28ed 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -292,11 +292,14 @@ enum PathRayFlag { PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024), + /* Special flag to tag unaligned BVH nodes. */ + PATH_RAY_NODE_UNALIGNED = 2048, - PATH_RAY_MIS_SKIP = 2048, - PATH_RAY_DIFFUSE_ANCESTOR = 4096, - PATH_RAY_SINGLE_PASS_DONE = 8192, + PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + + PATH_RAY_MIS_SKIP = 4096, + PATH_RAY_DIFFUSE_ANCESTOR = 8192, + PATH_RAY_SINGLE_PASS_DONE = 16384, }; /* Closure Label */ @@ -769,7 +772,7 @@ typedef ccl_addr_space struct ShaderData { int type; /* parametric coordinates - * - barycentric weights for triangles */ + * - barycentric weights for triangles */ float u; float v; /* object id if there is one, ~0 otherwise */ @@ -792,14 +795,14 @@ typedef ccl_addr_space struct ShaderData { #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is - * not readily suitable as a tangent for shading on triangles. */ + * not readily suitable as a tangent for shading on triangles. 
*/ float3 dPdu; float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ + * re-interpolating them constantly for shading */ Transform ob_tfm; Transform ob_itfm; #endif @@ -1171,11 +1174,11 @@ typedef ccl_addr_space struct DebugData { #define QUEUE_EMPTY_SLOT -1 /* -* Queue 1 - Active rays -* Queue 2 - Background queue -* Queue 3 - Shadow ray cast kernel - AO -* Queeu 4 - Shadow ray cast kernel - direct lighting -*/ + * Queue 1 - Active rays + * Queue 2 - Background queue + * Queue 3 - Shadow ray cast kernel - AO + * Queeu 4 - Shadow ray cast kernel - direct lighting + */ #define NUM_QUEUES 4 /* Queue names */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index aad06ed5c76..37907cd8fdc 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -35,6 +35,7 @@ # include "../../kernel_montecarlo.h" # include "../../kernel_projection.h" # include "../../geom/geom.h" +# include "../../bvh/bvh.h" # include "../../kernel_accumulate.h" # include "../../kernel_camera.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index ebe739ebd0e..2bb2be5e6b3 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -47,6 +47,7 @@ #include "kernel_camera.h" #include "kernels/cpu/kernel_cpu_image.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_projection.h" #include "kernel_accumulate.h" @@ -912,7 +913,7 @@ bool OSLRenderServices::texture(ustring filename, #endif bool status; - if(filename[0] == '@') { + if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); @@ -993,7 +994,7 @@ bool OSLRenderServices::texture3d(ustring filename, } bool status; - if(filename[0] == '@') { + 
if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 49030f33c26..b43f8402d42 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,6 +81,7 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index a00401845c8..7cd2922dd4f 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -88,7 +88,7 @@ shader node_image_texture( string color_space = "sRGB", string projection = "flat", string interpolation = "smartcubic", - string wrap = "periodic", + string extension = "periodic", float projection_blend = 0.0, int is_float = 1, int use_alpha = 1, @@ -108,7 +108,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "box") { /* object space normal */ @@ -184,7 +184,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[0] * tmp_alpha; } if (weight[1] > 0.0) { @@ -195,7 +195,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[1] * tmp_alpha; } if (weight[2] > 0.0) { @@ -206,7 +206,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[2] * tmp_alpha; } } @@ -219,7 +219,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "tube") { point projected = map_to_tube(texco_remap_square(p)); @@ -230,6 +230,6 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } } diff 
--git a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl new file mode 100644 index 00000000000..903dfcdc881 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl @@ -0,0 +1,25 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_rgb_to_bw( + color Color = 0.0, + output float Val = 0.0) +{ + Val = Color[0] * 0.2126 + Color[1] * 0.7152 + Color[2] * 0.0722; +} + diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index e1c7e2cea99..88d6dab04d0 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -31,6 +31,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index aa9c07c867e..44732734c31 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -72,8 +72,16 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint width = info.x; uint height = info.y; uint offset = info.z; - uint periodic = (info.w & 0x1); - uint interpolation = info.w >> 1; + + /* Image Options */ + uint interpolation = (info.w & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; + uint extension; + if(info.w & (1 << 1)) + extension = EXTENSION_REPEAT; + else if(info.w & (1 << 2)) + extension = EXTENSION_EXTEND; + else + extension = EXTENSION_CLIP; float4 r; int ix, iy, nix, niy; @@ -81,22 +89,26 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, svm_image_texture_frac(x*width, &ix); svm_image_texture_frac(y*height, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); } - else { + else if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else { /* EXTENSION_EXTEND */ ix = svm_image_texture_wrap_clamp(ix, width); iy = svm_image_texture_wrap_clamp(iy, height); - } + r = svm_image_texture_read(kg, id, offset + ix + iy*width); } - else { /* We default to linear interpolation if it is not closest */ + else { /* INTERPOLATION_LINEAR */ float tx = svm_image_texture_frac(x*width - 0.5f, &ix); float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); @@ -104,14 +116,17 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, niy = svm_image_texture_wrap_periodic(iy+1, height); } else { - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } nix = svm_image_texture_wrap_clamp(ix+1, width); niy = svm_image_texture_wrap_clamp(iy+1, height); + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); } - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); r += 
(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); |