Diffstat (limited to 'intern/cycles/kernel/bvh')
-rw-r--r-- | intern/cycles/kernel/bvh/bvh.h             |   3
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_local.h       |  12
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_shadow_all.h  |  12
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_traversal.h   |  16
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_types.h       |   2
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_volume.h      |  10
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_volume_all.h  |  11
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_local.h      | 402
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_nodes.h      | 532
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_shadow_all.h | 687
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_traversal.h  | 642
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_volume.h     | 483
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_volume_all.h | 554
-rw-r--r-- | intern/cycles/kernel/bvh/qbvh_nodes.h      |   3
14 files changed, 3367 insertions, 2 deletions
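
For context on the patch below: the new OBVH layout intersects a ray against all eight children of a node at once (the QBVH path tested four). The following is a minimal, standalone sketch of the aligned slab test that the patch implements in obvh_aligned_node_intersect, written here with raw AVX intrinsics instead of Cycles' avxf/avx3f wrappers; the function and parameter names are illustrative only and do not appear in the patch.

#include <immintrin.h>

/* Sketch (not from the patch) of an 8-wide slab test as done by
 * obvh_aligned_node_intersect. lo/hi hold the eight children's bounds
 * per axis, pre-swizzled so lo is the near side for the current ray
 * direction signs -- the job of obvh_near_far_idx_calc. Returns a
 * bitmask of children whose [tnear, tfar] interval is non-empty. */
static inline int obvh_slab_test_sketch(const __m256 lo[3],
                                        const __m256 hi[3],
                                        const __m256 org[3],
                                        const __m256 idir[3],
                                        __m256 tnear, __m256 tfar,
                                        __m256 *dist)
{
	for(int axis = 0; axis < 3; axis++) {
		/* t = (bound - org) * idir; under AVX2 the patch fuses this
		 * into msub() with a precomputed org*idir term. */
		__m256 t0 = _mm256_mul_ps(_mm256_sub_ps(lo[axis], org[axis]), idir[axis]);
		__m256 t1 = _mm256_mul_ps(_mm256_sub_ps(hi[axis], org[axis]), idir[axis]);
		tnear = _mm256_max_ps(tnear, t0);
		tfar = _mm256_min_ps(tfar, t1);
	}
	/* Per-child entry distance, like *dist = tnear in the patch. */
	*dist = tnear;
	/* One bit per hit child; the traversal loop peels bits off with __bscf(). */
	return _mm256_movemask_ps(_mm256_cmp_ps(tnear, tfar, _CMP_LE_OQ));
}

Because up to eight children can be pushed per node, the traversal stack grows accordingly: the patch adds BVH_OSTACK_SIZE 768, doubling BVH_QSTACK_SIZE 384, and sorts multi-hit pushes with the obvh_stack_sort() networks so the nearest child is always visited first.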
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index d3e0b25a200..81100ecd44d 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. */ #ifdef __QBVH__ # include "kernel/bvh/qbvh_nodes.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_nodes.h" +#endif #endif /* Regular BVH traversal */ diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 9292cc76a5c..5a150fa52d5 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_local.h" +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_local.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -244,6 +247,15 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, int max_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + local_isect, + local_object, + lcg_state, + max_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index cfc567ff9ca..d525b29fd94 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_shadow_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_shadow_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, uint *num_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + visibility, + max_hits, + num_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 551625eae78..e95d2408201 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -20,6 +20,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_traversal.h" #endif +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_traversal.h" +#endif #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect @@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, ) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility +# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + , lcg_state, + difl, + extmax +# endif + ); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index ead424aaaaf..4ca0dc2225e 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ #define BVH_STACK_SIZE 192 #define BVH_QSTACK_SIZE 384 - +#define BVH_OSTACK_SIZE 768 /* BVH intersection function variations */ #define BVH_INSTANCING 1 diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index ce5fc7be33d..7d03855cb8f 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include 
"kernel/bvh/qbvh_volume.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index f2379efc656..8df3122c044 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -382,6 +385,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + max_hits, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h new file mode 100644 index 00000000000..b61340ac7a8 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_local.h @@ -0,0 +1,402 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for subsurface scattering, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device void BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + LocalIntersection *local_isect, + int local_object, + uint *lcg_state, + int max_hits) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; + + local_isect->num_hits = 0; + + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, + local_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); +#else + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); +#endif + object = local_object; + } + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return; + } +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); + + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* Intersect ray against primitive, */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + /* Intersect ray against primitive. */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); + } + break; + } +#endif + default: + break; + } + } + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h new file mode 100644 index 00000000000..93f35f6dffb --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_nodes.h @@ -0,0 +1,532 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Aligned nodes intersection AVX code is adopted from Embree, + */ + +struct OBVHStackItem { + int addr; + float dist; +}; + +ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, + int *ccl_restrict near_x, + int *ccl_restrict near_y, + int *ccl_restrict near_z, + int *ccl_restrict far_x, + int *ccl_restrict far_y, + int *ccl_restrict far_z) + +{ +#ifdef __KERNEL_SSE__ + *near_x = 0; *far_x = 1; + *near_y = 2; *far_y = 3; + *near_z = 4; *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; *far_x -= mask_x; + *near_y += mask_y; *far_y -= mask_y; + *near_z += mask_z; *far_z -= mask_z; +#else + if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } + if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } + if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } +#endif +} + +ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, + OBVHStackItem *ccl_restrict b) +{ + OBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5) +{ + obvh_stack_sort(s1, s2, s3, s4); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6) +{ + obvh_stack_sort(s1, s2, s3, s4, s5); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + 
if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7, + OBVHStackItem *ccl_restrict s8) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); + if(s8->dist < s7->dist) { + obvh_item_swap(s7, s8); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } + } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#else + const avx3f& org, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x); + const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z); + + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +ccl_device_inline int obvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#else + const avx3f& P, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); + const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); + + const float round_down = 1.0f - difl; + 
const float round_up = 1.0f + difl; + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = round_down*tnear <= round_up*tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int obvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +} + +ccl_device_inline int obvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = 
kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
+ */ + +ccl_device_inline int obvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return obvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int obvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return obvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h new file mode 100644 index 00000000000..3e877065127 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h @@ -0,0 +1,687 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const int skip_object, + const uint max_hits, + uint *num_hits) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + + *num_hits = 0; + isect_array->t = tmax; + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(false +#ifdef __VISIBILITY_FLAG__ + || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) +#endif +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) +//#if !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
+ */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. 
*/ + if(p_type == PRIMITIVE_TRIANGLE) { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + int hit = triangle_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr); + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + } //while + } else { + kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type); + +#if BVH_FEATURE(BVH_INSTANCING) + int* nhiptr = &num_hits_in_instance; +#else + int nhi= 0; + int *nhiptr = &nhi; +#endif + + int result = triangle_intersect8(kg, + &isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + prim_count, + num_hits, + max_hits, + nhiptr, + isect_t); + if(result == 2) { + return true; + } + } // prim_count + } // PRIMITIVE_TRIANGLE + else { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + +#ifdef __SHADOW_TRICKS__ + uint tri_object = (object == OBJECT_NONE) + ? kernel_tex_fetch(__prim_object, prim_addr) + : object; + if(tri_object == skip_object) { + ++prim_addr; + continue; + } +#endif + + bool hit; + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch(p_type) { + +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + break; + } +#endif + default: { + hit = false; + break; + } + } + + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? 
*/ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + }//while prim + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. 
*/ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return false; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h new file mode 100644 index 00000000000..2021d8e1143 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_traversal.h @@ -0,0 +1,642 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + avxf tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(UNLIKELY(node_dist > isect->t) +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0 +#endif + ) + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + avxf dist; + + BVH_DEBUG_NEXT_NODE(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + avxf cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. 
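+ * The next node is fetched before the primitive loops below, since
+ * leaf processing consumes prim_addr and pushes nothing back.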
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//for + } + else { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect8(kg, + &isect, + P, + dir, + visibility, + object, + prim_addr, + prim_count, + 0, + 0, + NULL, + 0.0f)) + { + tfar = avxf(isect->t); + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//prim count + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
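+ * The ray is transformed into object space and traversal restarts at
+ * the instance's BVH root; a sentinel entry marks the return point.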
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h new file mode 100644 index 00000000000..da9ddbd4f24 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_volume.h @@ -0,0 +1,483 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
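+ * OBVH nodes hold up to eight children, which are tested against the
+ * ray with one set of AVX operations (hence the avxf/avx3f types below).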
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+ /* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children.
We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
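+ * Restores the world-space ray and re-derives the AVX traversal
+ * vectors after leaving the instance subtree.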
*/
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
new file mode 100644
index 00000000000..a88573e6f86
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_volume_all.h
@@ -0,0 +1,554 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+ /* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers.
*/
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ uint num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return 0;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
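+ * (The same push-and-sort pattern repeats below for up to 8 hits.)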
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. 
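+ * The cursor advances and t is reset to the full ray extent, so
+ * further volume hits behind this one are still found.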
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; +# if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +# endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +# if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
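+ * Hit distances recorded inside the instance are in object space and
+ * are scaled by t_fac below to bring them back to world space.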
*/ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. */ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return num_hits; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 3036efd4198..2e622af1758 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, /* Axis-aligned nodes intersection */ -ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ |
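The hand-unrolled 1..8-hit cases in the OBVH traversal functions above all specialize one generic pattern. A minimal sketch of that pattern (not part of the patch), assuming only that __bscf() clears and returns the index of the lowest set bit of the mask, as it is used throughout this code:

	/* Push every hit child together with its entry distance. */
	while(child_mask != 0) {
		const int r = __bscf(child_mask);
		++stack_ptr;
		kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
		traversal_stack[stack_ptr].addr = __float_as_int(cnodes[r]);
		traversal_stack[stack_ptr].dist = ((float*)&dist)[r];
	}
	/* Sort the pushed span by distance, then pop the closest node. */
	node_addr = traversal_stack[stack_ptr].addr;
	--stack_ptr;

The unrolled versions avoid the loop entirely, skip sorting for the common 1- and 2-hit cases, and call the obvh_stack_sort() overload matching the exact number of pushed items.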