diff options
-rw-r--r-- | intern/cycles/bvh/bvh.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 5 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh.h | 5 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_shadow.h | 26 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_subsurface.h | 29 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_traversal.h | 37 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_bvh_volume.h | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh.h | 138 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_shadow.h | 378 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_subsurface.h | 300 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_traversal.h | 361 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_qbvh_volume.h | 320 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cpu.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_types.h | 7 | ||||
-rw-r--r-- | intern/cycles/render/mesh.cpp | 8 | ||||
-rw-r--r-- | intern/cycles/render/scene.h | 5 |
17 files changed, 1616 insertions, 33 deletions
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index b591d5973fe..e7141c9ec64 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -479,8 +479,11 @@ void BVH::pack_instances(size_t nodes_size) pack_nodes[pack_nodes_offset + nsize_bbox] = data; - if(use_qbvh) - pack_nodes[pack_nodes_offset + nsize_bbox+1] = bvh_nodes[i + nsize_bbox+1]; + if(use_qbvh) { + memcpy(&pack_nodes[pack_nodes_offset + nsize_bbox+1], + &bvh_nodes[i + nsize_bbox+1], + sizeof(int4) * (nsize - (nsize_bbox+1))); + } pack_nodes_offset += nsize; } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index ca1065f114a..f8d2ee60a3a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -123,6 +123,11 @@ set(SRC_GEOM_HEADERS geom/geom_motion_triangle.h geom/geom_object.h geom/geom_primitive.h + geom/geom_qbvh.h + geom/geom_qbvh_shadow.h + geom/geom_qbvh_subsurface.h + geom/geom_qbvh_traversal.h + geom/geom_qbvh_volume.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 3a768f37dd9..38fd7858a99 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -21,6 +21,7 @@ /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ #define BVH_STACK_SIZE 192 #define BVH_NODE_SIZE 4 +#define BVH_QNODE_SIZE 7 #define TRI_NODE_SIZE 3 /* silly workaround for float extended precision that happens when compiling diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index a9892679e24..c0eefcd9c7f 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -48,6 +48,11 @@ CCL_NAMESPACE_BEGIN #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) +/* Common QBVH functions. */ +#ifdef __QBVH__ +#include "geom_qbvh.h" +#endif + /* Regular BVH traversal */ #define BVH_FUNCTION_NAME bvh_intersect diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h index 4bdfc7478aa..d6056026f24 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h @@ -17,6 +17,10 @@ * limitations under the License. */ +#ifdef __QBVH__ +#include "geom_qbvh_shadow.h" +#endif + /* This is a template BVH traversal function, where various features can be * enabled/disabled. This way we can compile optimized versions for each case * without new features slowing things down. @@ -380,11 +384,23 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, uint *num_hits) { - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - max_hits, - num_hits); +#ifdef __QBVH__ + if(kernel_data.bvh.use_qbvh) { + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect_array, + max_hits, + num_hits); + } + else +#endif + { + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect_array, + max_hits, + num_hits); + } } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index 90cbbc08153..ff462142f6f 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -17,6 +17,10 @@ * limitations under the License. */ +#ifdef __QBVH__ +#include "geom_qbvh_subsurface.h" +#endif + /* This is a template BVH traversal function for subsurface scattering, where * various features can be enabled/disabled. This way we can compile optimized * versions for each case without new features slowing things down. @@ -300,12 +304,25 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, uint *lcg_state, int max_hits) { - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - subsurface_object, - lcg_state, - max_hits); +#ifdef __QBVH__ + if(kernel_data.bvh.use_qbvh) { + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect_array, + subsurface_object, + lcg_state, + max_hits); + } + else +#endif + { + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect_array, + subsurface_object, + lcg_state, + max_hits); + } } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index d48eda5f554..6e5b6ea476e 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -17,6 +17,10 @@ * limitations under the License. */ +#ifdef __QBVH__ +#include "geom_qbvh_traversal.h" +#endif + /* This is a template BVH traversal function, where various features can be * enabled/disabled. This way we can compile optimized versions for each case * without new features slowing things down. @@ -381,16 +385,33 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #endif ) { - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect, - visibility +#ifdef __QBVH__ + if(kernel_data.bvh.use_qbvh) { + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect, + visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + , lcg_state, + difl, + extmax +#endif + ); + } + else +#endif + { + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect, + visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax + , lcg_state, + difl, + extmax #endif - ); + ); + } } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h index bae90c2a24a..8a25b5dc884 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/geom/geom_bvh_volume.h @@ -17,6 +17,10 @@ * limitations under the License. */ +#ifdef __QBVH__ +#include "geom_qbvh_volume.h" +#endif + /* This is a template BVH traversal function for volumes, where * various features can be enabled/disabled. This way we can compile optimized * versions for each case without new features slowing things down. @@ -314,9 +318,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect) { - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect); +#ifdef __QBVH__ + if(kernel_data.bvh.use_qbvh) { + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect); + } + else +#endif + { + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect); + } } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h new file mode 100644 index 00000000000..a1dd89c41ca --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +ccl_device_inline void qbvh_stack_sort(int *__restrict s1, + int *__restrict s2, + int *__restrict s3, + float *__restrict d1, + float *__restrict d2, + float *__restrict d3) +{ + if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); } + if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); } + if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); } +} + +ccl_device_inline void qbvh_stack_sort(int *__restrict s1, + int *__restrict s2, + int *__restrict s3, + int *__restrict s4, + float *__restrict d1, + float *__restrict d2, + float *__restrict d3, + float *__restrict d4) +{ + if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); } + if(*d4 < *d3) { util_swap(s4, s3); util_swap(d4, d3); } + if(*d3 < *d1) { util_swap(s3, s1); util_swap(d3, d1); } + if(*d4 < *d2) { util_swap(s4, s2); util_swap(d4, d2); } + if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); } +} + +ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#else + const sse3f& org, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + ssef *__restrict dist) +{ + const int offset = nodeAddr*BVH_QNODE_SIZE; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; +#endif + +#ifdef __KERNEL_SSE41__ + const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear)); + const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar)); + const sseb vmask = cast(tNear) > cast(tFar); + int mask = (int)movemask(vmask)^0xf; +#else + const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); + const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); + const sseb vmask = tNear <= tFar; + int mask = (int)movemask(vmask); +#endif + *dist = tNear; + return mask; +} + +ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, + const ssef& tnear, + const ssef& tfar, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#else + const sse3f& P, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int nodeAddr, + const float difl, + ssef *__restrict dist) +{ + const int offset = nodeAddr*BVH_QNODE_SIZE; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; +#endif + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); + const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); + const sseb vmask = round_down*tNear <= round_up*tFar; + *dist = tNear; + return (int)movemask(vmask); +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h new file mode 100644 index 00000000000..f8279996450 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h @@ -0,0 +1,378 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + uint *num_hits) +{ + /* TODO(sergey): + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + *num_hits = 0; + isect_array->t = tmax; + + ssef tnear(0.0f), tfar(tmax); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + ssef dist; + int traverseChild = qbvh_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); + + if(traverseChild != 0) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + + /* One child is hit, continue with that child. */ + int r = __bscf(traverseChild); + if(traverseChild == 0) { + nodeAddr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(traverseChild); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(traverseChild == 0) { + if(d1 < d0) { + nodeAddr = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + continue; + } + else { + nodeAddr = c0; + ++stackPtr; + traversalStack[stackPtr] = c1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stackPtr; + traversalStack[stackPtr] = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(traverseChild == 0) { + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &d2, &d1, &d0); + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stackPtr; + traversalStack[stackPtr] = c3; + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &traversalStack[stackPtr - 3], + &d3, &d2, &d1, &d0); + } + + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + + /* If node is leaf, fetch triangle list. */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* Pop. */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + continue; + } +#endif + + /* Primitive intersection. */ + while(primAddr < primAddr2) { + bool hit; + uint type = kernel_tex_fetch(__prim_type, primAddr); + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr); + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + else + hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + break; + } +#endif + default: { + hit = false; + break; + } + } + + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2); + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + primAddr++; + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); +#endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + if(num_hits_in_instance) { + float t_fac; + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm); +#else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +#endif + + /* scale isect->t to adjust for instancing */ + for(int i = 0; i < num_hits_in_instance; i++) + (isect_array-i-1)->t *= t_fac; + } + else { + float ignore_t = FLT_MAX; + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); +#endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(tmax); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return false; +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h new file mode 100644 index 00000000000..bc43d81f9d3 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h @@ -0,0 +1,300 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for subsurface scattering, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + int subsurface_object, + uint *lcg_state, + int max_hits) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - SSE for hair. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; + uint num_hits = 0; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + ssef tnear(0.0f), tfar(isect_t); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + ssef dist; + int traverseChild = qbvh_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); + + if(traverseChild != 0) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + + /* One child is hit, continue with that child. */ + int r = __bscf(traverseChild); + if(traverseChild == 0) { + nodeAddr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(traverseChild); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(traverseChild == 0) { + if(d1 < d0) { + nodeAddr = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + continue; + } + else { + nodeAddr = c0; + ++stackPtr; + traversalStack[stackPtr] = c1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stackPtr; + traversalStack[stackPtr] = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(traverseChild == 0) { + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &d2, &d1, &d0); + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stackPtr; + traversalStack[stackPtr] = c3; + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &traversalStack[stackPtr - 3], + &d3, &d2, &d1, &d0); + } + + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + + /* If node is leaf, fetch triangle list. */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* Pop. */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* Primitive intersection. */ + for(; primAddr < primAddr2; primAddr++) { + /* only primitives from the same object */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + + if(tri_object != subsurface_object) + continue; + + /* Intersect ray against primitive */ + uint type = kernel_tex_fetch(__prim_type, primAddr); + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + break; + } +#endif + default: { + break; + } + } + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) { + object = subsurface_object; + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect_t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return num_hits; +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h new file mode 100644 index 00000000000..56289900e80 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h @@ -0,0 +1,361 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps = 0; +#endif + + ssef tnear(0.0f), tfar(ray->t); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + int traverseChild; + ssef dist; + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + traverseChild = qbvh_node_intersect_robust(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + difl, + &dist); + } + else +#endif + { + traverseChild = qbvh_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); + } + + if(traverseChild != 0) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + + /* One child is hit, continue with that child. */ + int r = __bscf(traverseChild); + if(traverseChild == 0) { + nodeAddr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(traverseChild); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(traverseChild == 0) { + if(d1 < d0) { + nodeAddr = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + continue; + } + else { + nodeAddr = c0; + ++stackPtr; + traversalStack[stackPtr] = c1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stackPtr; + traversalStack[stackPtr] = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(traverseChild == 0) { + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &d2, &d1, &d0); + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stackPtr; + traversalStack[stackPtr] = c3; + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &traversalStack[stackPtr - 3], + &d3, &d2, &d1, &d0); + } + + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + + /* If node is leaf, fetch triangle list. */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* Pop. */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(leaf.z) & visibility) == 0) { + continue; + } +#endif + + /* Primitive intersection. */ + while(primAddr < primAddr2) { + bool hit; + uint type = kernel_tex_fetch(__prim_type, primAddr); + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + hit = triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr); + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + else + hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + break; + } +#endif + default: { + hit = false; + break; + } + } + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif + + /* Shadow ray early termination. */ + if(hit) { + tfar = ssef(isect->t); + if(visibility == PATH_RAY_SHADOW_OPAQUE) + return true; + } + + primAddr++; + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h new file mode 100644 index 00000000000..3630436bddc --- /dev/null +++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h @@ -0,0 +1,320 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + + const uint visibility = PATH_RAY_ALL_VISIBILITY; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + ssef tnear(0.0f), tfar(ray->t); + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif + + ssef dist; + int traverseChild = qbvh_node_intersect(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#else + org, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + nodeAddr, + &dist); + + if(traverseChild != 0) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + + /* One child is hit, continue with that child. */ + int r = __bscf(traverseChild); + if(traverseChild == 0) { + nodeAddr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(traverseChild); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(traverseChild == 0) { + if(d1 < d0) { + nodeAddr = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + continue; + } + else { + nodeAddr = c0; + ++stackPtr; + traversalStack[stackPtr] = c1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stackPtr; + traversalStack[stackPtr] = c1; + ++stackPtr; + traversalStack[stackPtr] = c0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(traverseChild == 0) { + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &d2, &d1, &d0); + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(traverseChild); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stackPtr; + traversalStack[stackPtr] = c3; + ++stackPtr; + traversalStack[stackPtr] = c2; + qbvh_stack_sort(&traversalStack[stackPtr], + &traversalStack[stackPtr - 1], + &traversalStack[stackPtr - 2], + &traversalStack[stackPtr - 3], + &d3, &d2, &d1, &d0); + } + + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + + /* If node is leaf, fetch triangle list. */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6); + int primAddr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* Pop. */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* Primitive intersection. */ + for(; primAddr < primAddr2; primAddr++) { + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + + /* Intersect ray against primitive. */ + uint type = kernel_tex_fetch(__prim_type, primAddr); + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr); + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + else + bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + break; + } +#endif + default: { + break; + } + } + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VOLUME) { + +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +#if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +#ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#else + org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 2f0b78ea073..8140a3b7725 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -52,7 +52,6 @@ template<typename T> struct texture { return data[index]; } -#if 0 ccl_always_inline ssef fetch_ssef(int index) { kernel_assert(index >= 0 && index < width); @@ -64,7 +63,6 @@ template<typename T> struct texture { kernel_assert(index >= 0 && index < width); return ((ssei*)data)[index]; } -#endif T *data; int width; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 460ca7b68eb..1bc5cf1fc32 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -57,6 +57,9 @@ CCL_NAMESPACE_BEGIN /* device capabilities */ #ifdef __KERNEL_CPU__ +#ifdef __KERNEL_SSE2__ +# define __QBVH__ +#endif #define __KERNEL_SHADING__ #define __KERNEL_ADV_SHADING__ #define __BRANCHED_PATH__ @@ -947,8 +950,8 @@ typedef struct KernelBVH { int have_motion; int have_curves; int have_instancing; - - int pad1, pad2, pad3; + int use_qbvh; + int pad1, pad2; } KernelBVH; typedef enum CurveFlag { diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 6137f7d4fdc..f8671db18dd 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -1027,6 +1027,7 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * } dscene->data.bvh.root = pack.root_index; + dscene->data.bvh.use_qbvh = scene->params.use_qbvh; } void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) @@ -1094,7 +1095,12 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update) { - pool.push(function_bind(&Mesh::compute_bvh, mesh, &scene->params, &progress, i, num_bvh)); + pool.push(function_bind(&Mesh::compute_bvh, + mesh, + &scene->params, + &progress, + i, + num_bvh)); i++; } } diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 5d205225d97..51324edd8ff 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -26,6 +26,7 @@ #include "util_param.h" #include "util_string.h" +#include "util_system.h" #include "util_thread.h" #include "util_types.h" #include "util_vector.h" @@ -135,11 +136,7 @@ public: bvh_type = BVH_DYNAMIC; use_bvh_cache = false; use_bvh_spatial_split = false; -#ifdef __QBVH__ - use_qbvh = true; -#else use_qbvh = false; -#endif persistent_data = false; } |