37 files changed, 5079 insertions, 65 deletions
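The heart of this patch is an eight-wide ("octo") BVH node format aimed at AVX2 CPUs. As a reading aid for the packing code in bvh8.cpp below, the following hypothetical struct sketches the aligned inner-node layout that BVH8::pack_aligned_node() writes. The struct and its field names are assumptions, but the row order mirrors the eight float8 rows (data[0..7]) in the patch, and the total size matches BVH_ONODE_SIZE (16 float4 slots).

/* Illustrative sketch, not part of the patch: the memory image written by
 * BVH8::pack_aligned_node(). Bounds are stored SoA, one SIMD lane per
 * child, so a single 8-wide load fetches the same bound for all children. */
struct OctoNodeSketch {
	float visibility;          /* row 0: visibility & ~PATH_RAY_NODE_UNALIGNED */
	float time_from, time_to;  /* motion-blur time range */
	float pad[5];              /* remainder of row 0, zeroed by memset */
	float min_x[8], max_x[8];  /* rows 1-2 */
	float min_y[8], max_y[8];  /* rows 3-4 */
	float min_z[8], max_z[8];  /* rows 5-6 */
	int   child[8];            /* row 7: stored via __int_as_float(); < 0 = leaf */
};
static_assert(sizeof(OctoNodeSketch) == 16 * 4 * sizeof(float),
              "16 float4 slots, i.e. BVH_ONODE_SIZE");

Lanes beyond the actual child count are filled with inverted boxes (min = FLT_MAX, max = -FLT_MAX), so padding children can never pass the intersection test (the "always 4 child nodes" comments were copied over from the BVH4 code; for BVH8 the kernel assumes eight lanes).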
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index c97d942af9d..80b83c94012 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -50,6 +50,7 @@ enum_displacement_methods = ( enum_bvh_layouts = ( ('BVH2', "BVH2", "", 1), ('BVH4', "BVH4", "", 2), + ('BVH8', "BVH8", "", 4), ) enum_bvh_types = ( @@ -686,7 +687,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_bvh_layout = EnumProperty( name="BVH Layout", items=enum_bvh_layouts, - default='BVH4', + default='BVH8', ) cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index fedcb874269..fcdadf0ad6e 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -679,7 +679,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, params.texture_limit = 0; } - params.bvh_layout = DebugFlags().cpu.bvh_layout; + /* TODO(sergey): Once OSL supports per-microarchitecture optimization get + * rid of this. + */ + if (params.shadingsystem == SHADINGSYSTEM_OSL) { + params.bvh_layout = BVH_LAYOUT_BVH4; + } + else { + params.bvh_layout = DebugFlags().cpu.bvh_layout; + } return params; } diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index b8171e7f70d..fcd28572fdf 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -10,6 +10,7 @@ set(SRC bvh.cpp bvh2.cpp bvh4.cpp + bvh8.cpp bvh_binning.cpp bvh_build.cpp bvh_node.cpp @@ -22,6 +23,7 @@ set(SRC_HEADERS bvh.h bvh2.h bvh4.h + bvh8.h bvh_binning.h bvh_build.h bvh_node.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index b524ca07d8d..bc73a3ad264 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -22,6 +22,7 @@ #include "bvh/bvh2.h" #include "bvh/bvh4.h" +#include "bvh/bvh8.h" #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" @@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout) switch(layout) { case BVH_LAYOUT_BVH2: return "BVH2"; case BVH_LAYOUT_BVH4: return "BVH4"; + case BVH_LAYOUT_BVH8: return "BVH8"; case BVH_LAYOUT_NONE: return "NONE"; case BVH_LAYOUT_ALL: return "ALL"; } @@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) return new BVH2(params, objects); case BVH_LAYOUT_BVH4: return new BVH4(params, objects); + case BVH_LAYOUT_BVH8: + return new BVH8(params, objects); case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: break; @@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility) } } visibility |= ob->visibility_for_tracing(); + + } +} + +bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh) +{ + if(node->is_leaf()) { + return node->is_unaligned; + } + else { + return node_is_unaligned(node, bvh); + } +} + +bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh) +{ + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + + switch(bvh) { + case bvh2: + return node0->is_unaligned || node1->is_unaligned; + break; + case bvh4: + return leaf_check(node0, bvh2) || leaf_check(node1, bvh2); + break; + case bvh8: + return leaf_check(node0, bvh4) || leaf_check(node1, bvh4); + break; + default: + assert(0); + return false; } } @@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) * BVH's are stored in global arrays. 
This function merges them into the * top level BVH, adjusting indexes and offsets where appropriate. */ - /* TODO(sergey): This code needs adjustment for wider BVH than 4. */ const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4); + const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8); /* Adjust primitive index to point to the triangle in the global array, for * meshes with transform applied and already in the top level BVH. @@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) { size_t nsize, nsize_bbox; if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) { - nsize = use_qbvh - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_UNALIGNED_NODE_SIZE; - nsize_bbox = (use_qbvh)? 13: 0; + if(use_obvh) { + nsize = BVH_UNALIGNED_ONODE_SIZE; + nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1; + } + else { + nsize = use_qbvh + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_UNALIGNED_NODE_SIZE; + nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0; + } } else { - nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; - nsize_bbox = (use_qbvh)? 7: 0; + if(use_obvh) { + nsize = BVH_ONODE_SIZE; + nsize_bbox = BVH_ONODE_SIZE-1; + } + else { + nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; + nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0; + } } memcpy(pack_nodes + pack_nodes_offset, @@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* Modify offsets into arrays */ int4 data = bvh_nodes[i + nsize_bbox]; - - data.z += (data.z < 0)? -noffset_leaf: noffset; - data.w += (data.w < 0)? -noffset_leaf: noffset; - - if(use_qbvh) { - data.x += (data.x < 0)? -noffset_leaf: noffset; - data.y += (data.y < 0)? -noffset_leaf: noffset; + int4 data1 = bvh_nodes[i + nsize_bbox-1]; + if(use_obvh) { + data.z += (data.z < 0) ? -noffset_leaf : noffset; + data.w += (data.w < 0) ? -noffset_leaf : noffset; + data.x += (data.x < 0) ? -noffset_leaf : noffset; + data.y += (data.y < 0) ? -noffset_leaf : noffset; + data1.z += (data1.z < 0) ? -noffset_leaf : noffset; + data1.w += (data1.w < 0) ? -noffset_leaf : noffset; + data1.x += (data1.x < 0) ? -noffset_leaf : noffset; + data1.y += (data1.y < 0) ? -noffset_leaf : noffset; + } + else { + data.z += (data.z < 0) ? -noffset_leaf : noffset; + data.w += (data.w < 0) ? -noffset_leaf : noffset; + if(use_qbvh) { + data.x += (data.x < 0)? -noffset_leaf: noffset; + data.y += (data.y < 0)? -noffset_leaf: noffset; + } } - pack_nodes[pack_nodes_offset + nsize_bbox] = data; + if(use_obvh) { + pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1; + } /* Usually this copies nothing, but we better * be prepared for possible node size extension. diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 6a82f915692..86be0bae4be 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -73,6 +73,12 @@ struct PackedBVH { } }; +enum BVH_TYPE { + bvh2, + bvh4, + bvh8 +}; + /* BVH */ class BVH @@ -93,6 +99,8 @@ protected: /* Refit range of primitives. 
*/ void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility); + static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh); + static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh); /* triangles and strands */ void pack_primitives(); diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp index 9d89d2b6afb..4a423c16559 100644 --- a/intern/cycles/bvh/bvh2.cpp +++ b/intern/cycles/bvh/bvh2.cpp @@ -25,13 +25,6 @@ CCL_NAMESPACE_BEGIN -static bool node_bvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - return node0->is_unaligned || node1->is_unaligned; -} - BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { @@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root) } else { stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_bvh_is_unaligned(root) + nextNodeIdx += node_is_unaligned(root, bvh2) ? BVH_UNALIGNED_NODE_SIZE : BVH_NODE_SIZE; } @@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root) } else { idx[i] = nextNodeIdx; - nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2) ? BVH_UNALIGNED_NODE_SIZE : BVH_NODE_SIZE; } diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp index 4faf47af7bb..a449587e607 100644 --- a/intern/cycles/bvh/bvh4.cpp +++ b/intern/cycles/bvh/bvh4.cpp @@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN * Perhaps we can merge nodes in actual tree and make our * life easier all over the place. */ -static bool node_qbvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - bool has_unaligned = false; - if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned; - } - else { - has_unaligned |= node0->get_child(0)->is_unaligned; - has_unaligned |= node0->get_child(1)->is_unaligned; - } - if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned; - } - else { - has_unaligned |= node1->get_child(0)->is_unaligned; - has_unaligned |= node1->get_child(1)->is_unaligned; - } - return has_unaligned; -} BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) @@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root) } else { stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_qbvh_is_unaligned(root) + nextNodeIdx += node_is_unaligned(root, bvh4) ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; } @@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root) } else { idx = nextNodeIdx; - nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + nextNodeIdx += node_is_unaligned(nodes[i], bvh4) ? 
BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; } @@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) visibility, 0.0f, 1.0f, - 4); + num_nodes); } else { pack_aligned_node(idx, @@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) visibility, 0.0f, 1.0f, - 4); + num_nodes); } } } diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp new file mode 100644 index 00000000000..e8b6726602c --- /dev/null +++ b/intern/cycles/bvh/bvh8.cpp @@ -0,0 +1,515 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "bvh/bvh8.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ +} + +void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +{ + float4 data[BVH_ONODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE); +} + +void BVH8::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. 
+ */ + pack_aligned_inner(e, en, num); + } +} + +void BVH8::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + BoundBox bounds[8]; + int child[8]; + for(int i = 0; i < num; ++i) { + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_aligned_node(e.idx, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH8::pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float8 data[8]; + memset(data, 0, sizeof(data)); + + data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); + data[0].b = time_from; + data[0].c = time_to; + for(int i = 0; i < num; i++) { + float3 bb_min = bounds[i].min; + float3 bb_max = bounds[i].max; + + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; + + data[7][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 8; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE); +} + +void BVH8::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + Transform aligned_space[8]; + BoundBox bounds[8]; + int child[8]; + for(int i = 0; i < num; ++i) { + aligned_space[i] = en[i].node->get_aligned_space(); + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_unaligned_node(e.idx, + aligned_space, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH8::pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float8 data[BVH_UNALIGNED_ONODE_SIZE]; + memset(data, 0, sizeof(data)); + data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); + data[0].b = time_from; + data[0].c = time_to; + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + bounds[i], + aligned_space[i]); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; + + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; + + data[13][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 8; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + + data[1][i] = 1.0f; + data[2][i] = 0.0f; + data[3][i] = 0.0f; + + data[4][i] = 0.0f; + data[5][i] = 0.0f; + data[6][i] = 0.0f; + + data[7][i] = 0.0f; + data[8][i] = 0.0f; + data[9][i] = 0.0f; + + data[10][i] = -FLT_MAX; + data[11][i] = -FLT_MAX; + data[12][i] = -FLT_MAX; + + data[13][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE); +} + +/* Quad SIMD Nodes */ + +void BVH8::pack_nodes(const BVHNode *root) +{ + /* Calculate size of the arrays required. 
*/ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_ONODE_SIZE; + } + /* Resize arrays. */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_is_unaligned(root, bvh8) + ? BVH_UNALIGNED_ONODE_SIZE + : BVH_ONODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* Inner node. */ + const BVHNode *node = e.node; + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + /* Collect nodes. */ + const BVHNode *nodes[8]; + int numnodes = 0; + if(node0->is_leaf()) { + nodes[numnodes++] = node0; + } + else { + const BVHNode *node00 = node0->get_child(0), + *node01 = node0->get_child(1); + if(node00->is_leaf()) { + nodes[numnodes++] = node00; + } + else { + nodes[numnodes++] = node00->get_child(0); + nodes[numnodes++] = node00->get_child(1); + } + if(node01->is_leaf()) { + nodes[numnodes++] = node01; + } + else { + nodes[numnodes++] = node01->get_child(0); + nodes[numnodes++] = node01->get_child(1); + } + } + if(node1->is_leaf()) { + nodes[numnodes++] = node1; + } + else { + const BVHNode *node10 = node1->get_child(0), + *node11 = node1->get_child(1); + if(node10->is_leaf()) { + nodes[numnodes++] = node10; + } + else { + nodes[numnodes++] = node10->get_child(0); + nodes[numnodes++] = node10->get_child(1); + } + if(node11->is_leaf()) { + nodes[numnodes++] = node11; + } + else { + nodes[numnodes++] = node11->get_child(0); + nodes[numnodes++] = node11->get_child(1); + } + } + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { + int idx; + if(nodes[i]->is_leaf()) { + idx = nextLeafNodeIdx++; + } + else { + idx = nextNodeIdx; + nextNodeIdx += node_is_unaligned(nodes[i], bvh8) + ? BVH_UNALIGNED_ONODE_SIZE + : BVH_ONODE_SIZE; + } + stack.push_back(BVHStackEntry(nodes[i], idx)); + } + /* Set node. */ + pack_inner(e, &stack[stack.size() - numnodes], numnodes); + } + } + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ + pack.root_index = (root->is_leaf()) ? -1 : 0; +} + +void BVH8::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? 
true: false, bbox, visibility); +} + +void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + int4 *data = &pack.leaf_nodes[idx]; + int4 c = data[0]; + /* Refit leaf node. */ + for(int prim = c.x; prim < c.y; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* Object instance. */ + bbox.grow(ob->bounds); + } + else { + /* Primitives. */ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* Curves. */ + int str_offset = (params.top_level) ? mesh->curve_offset : 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* Motion curves. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) { + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + } + else { + /* Triangles. */ + int tri_offset = (params.top_level) ? mesh->tri_offset : 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* Motion triangles. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) { + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + } + + visibility |= ob->visibility; + } + + float4 leaf_data[BVH_ONODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c.x); + leaf_data[0].y = __int_as_float(c.y); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(c.w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE); + } + else { + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[BVH_UNALIGNED_ONODE_SIZE-1]; + } + else { + c = data[BVH_ONODE_SIZE-1]; + } + /* Refit inner node, set bbox from children. */ + BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty }; + uint child_visibility[8] = { 0 }; + int num_nodes = 0; + + for(int i = 0; i < 8; ++i) { + if(c[i] != 0) { + refit_node((c[i] < 0)? 
-c[i]-1: c[i], (c[i] < 0), + child_bbox[i], child_visibility[i]); + ++num_nodes; + bbox.grow(child_bbox[i]); + visibility |= child_visibility[i]; + } + } + + if(is_unaligned) { + Transform aligned_space[8] = { transform_identity(), transform_identity(), + transform_identity(), transform_identity(), + transform_identity(), transform_identity(), + transform_identity(), transform_identity()}; + pack_unaligned_node(idx, + aligned_space, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + num_nodes); + } + else { + pack_aligned_node(idx, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + num_nodes); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h new file mode 100644 index 00000000000..99ff79e60b5 --- /dev/null +++ b/intern/cycles/bvh/bvh8.h @@ -0,0 +1,97 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __BVH8_H__ +#define __BVH8_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_ONODE_SIZE 16 +#define BVH_ONODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_ONODE_SIZE 28 + +/* BVH8 +* +* Octo BVH, with each node having eight children, to use with SIMD instructions. 
+*/ +class BVH8 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH8(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH8_H__ */ diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 879d07b9625..fd8b4977699 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } } return cnt; + case BVH_STAT_ONODE_COUNT: + cnt = 1; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + cnt += 1; + } + else { + for(int j = 0; j < node->num_children(); j++) + { + BVHNode *node_next = node->get_child(j); + if(node_next->is_leaf()) { + cnt += 1; + } + else { + for(int k = 0; k < node_next->num_children(); k++) { + cnt += node_next->get_child(k)->getSubtreeSize(stat); + } + } + } + } + } + return cnt; + case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned; + } + else { + for(int j = 0; j < node->num_children(); j++) { + BVHNode *node_next = node->get_child(j); + if(node_next->is_leaf()) { + has_unaligned |= node_next->is_unaligned; + } + else { + for(int k = 0; k < node_next->num_children(); k++) { + cnt += node_next->get_child(k)->getSubtreeSize(stat); + has_unaligned |= node_next->get_child(k)->is_unaligned; + } + } + } + } + } + cnt += has_unaligned? 
1: 0; + } + return cnt; case BVH_STAT_ALIGNED_COUNT: if(!is_unaligned) { cnt = 1; diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 94cf5ab730c..ed89d52a50a 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -39,6 +39,8 @@ enum BVH_STAT { BVH_STAT_ALIGNED_LEAF_COUNT, BVH_STAT_UNALIGNED_LEAF_COUNT, BVH_STAT_DEPTH, + BVH_STAT_ONODE_COUNT, + BVH_STAT_UNALIGNED_INNER_ONODE_COUNT, }; class BVHParams; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index e92bbbfa6e6..5b3761f8353 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices) if(system_cpu_support_sse2()) { info.bvh_layout_mask |= BVH_LAYOUT_BVH4; } + if (system_cpu_support_avx2()) { + info.bvh_layout_mask |= BVH_LAYOUT_BVH8; + } info.has_volume_decoupled = true; info.has_osl = true; info.has_half_images = true; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 118e18374a1..c6e92c6d89d 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS bvh/qbvh_traversal.h bvh/qbvh_volume.h bvh/qbvh_volume_all.h + bvh/obvh_nodes.h + bvh/obvh_shadow_all.h + bvh/obvh_local.h + bvh/obvh_traversal.h + bvh/obvh_volume.h + bvh/obvh_volume_all.h ) set(SRC_HEADERS @@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS ../util/util_types_float3_impl.h ../util/util_types_float4.h ../util/util_types_float4_impl.h + ../util/util_types_float8.h + ../util/util_types_float8_impl.h ../util/util_types_int2.h ../util/util_types_int2_impl.h ../util/util_types_int3.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index de1503e5564..2ad55d041bf 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. 
*/ #ifdef __QBVH__ # include "kernel/bvh/qbvh_nodes.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_nodes.h" +#endif #endif /* Regular BVH traversal */ diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 6356c197dd1..8def71bc890 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_local.h" +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_local.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, int max_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + local_isect, + local_object, + lcg_state, + max_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index cfc567ff9ca..d525b29fd94 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_shadow_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_shadow_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, uint *num_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + visibility, + max_hits, + num_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 551625eae78..e95d2408201 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -20,6 +20,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_traversal.h" #endif +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_traversal.h" +#endif #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect @@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, ) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility +# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + , lcg_state, + difl, + extmax +# endif + ); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index ead424aaaaf..4ca0dc2225e 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ #define BVH_STACK_SIZE 192 #define BVH_QSTACK_SIZE 384 - +#define BVH_OSTACK_SIZE 768 /* BVH intersection function variations */ #define BVH_INSTANCING 1 diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index ce5fc7be33d..7d03855cb8f 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { 
switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 2ee29ac9c27..3d9b598914f 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + max_hits, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h new file mode 100644 index 00000000000..50bcfa79b6c --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_local.h @@ -0,0 +1,409 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for subsurface scattering, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + LocalIntersection *local_isect, + int local_object, + uint *lcg_state, + int max_hits) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; + + local_isect->num_hits = 0; + + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, + local_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); +#else + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); +#endif + object = local_object; + } + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); + + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* Intersect ray against primitive, */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) + { + return true; + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + /* Intersect ray against primitive. */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) + { + return true; + } + } + break; + } +#endif + default: + break; + } + } + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + return false; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h new file mode 100644 index 00000000000..93f35f6dffb --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_nodes.h @@ -0,0 +1,532 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Aligned nodes intersection AVX code is adopted from Embree, + */ + +struct OBVHStackItem { + int addr; + float dist; +}; + +ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, + int *ccl_restrict near_x, + int *ccl_restrict near_y, + int *ccl_restrict near_z, + int *ccl_restrict far_x, + int *ccl_restrict far_y, + int *ccl_restrict far_z) + +{ +#ifdef __KERNEL_SSE__ + *near_x = 0; *far_x = 1; + *near_y = 2; *far_y = 3; + *near_z = 4; *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; *far_x -= mask_x; + *near_y += mask_y; *far_y -= mask_y; + *near_z += mask_z; *far_z -= mask_z; +#else + if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } + if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } + if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } +#endif +} + +ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, + OBVHStackItem *ccl_restrict b) +{ + OBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5) +{ + obvh_stack_sort(s1, s2, s3, s4); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6) +{ + obvh_stack_sort(s1, s2, s3, s4, s5); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem 
*ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7, + OBVHStackItem *ccl_restrict s8) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); + if(s8->dist < s7->dist) { + obvh_item_swap(s7, s8); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } + } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#else + const avx3f& org, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x); + const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z); + + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +ccl_device_inline int obvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#else + const avx3f& P, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); + const avxf tfar_y = 
msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = round_down*tnear <= round_up*tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int obvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +} + +ccl_device_inline int obvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + 
const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
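+ * The type is read from the node header: if PATH_RAY_NODE_UNALIGNED is
+ * set in the first float4, the node stores per-child affine transforms
+ * and the unaligned path is taken; otherwise the cheaper axis-aligned
+ * slab test (max of entry planes vs. min of exit planes) is used.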
+ */ + +ccl_device_inline int obvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return obvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int obvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return obvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h new file mode 100644 index 00000000000..3e877065127 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h @@ -0,0 +1,687 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. 
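+ * The function body is emitted once per feature combination via
+ * BVH_FUNCTION_FULL_NAME, so disabled features cost nothing at runtime.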
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const int skip_object, + const uint max_hits, + uint *num_hits) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + + *num_hits = 0; + isect_array->t = tmax; + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(false +#ifdef __VISIBILITY_FLAG__ + || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) +#endif +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) +//#if !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
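+ * (__bscf returns the index of the lowest set bit in child_mask and
+ * clears it, so each step below consumes exactly one hit child.)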
+ */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
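+ * (obvh_stack_sort is an insertion-sort network: each overload sorts
+ * its first N-1 items, then sifts the new item down by compare-and-swap,
+ * schematically:
+ *
+ *   obvh_stack_sort(s1, s2, s3);
+ *   if(s4->dist < s3->dist) { obvh_item_swap(s3, s4); ... }
+ *
+ * The smallest distance ends up in s1, i.e. on top of the stack, so the
+ * closest child is always popped first.)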
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. 
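+ * Leaves with fewer than three triangles use the scalar intersector;
+ * larger leaves go through triangle_intersect8, which appears to test
+ * the leaf's triangles against the ray in AVX batches of up to eight.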
*/ + if(p_type == PRIMITIVE_TRIANGLE) { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + int hit = triangle_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr); + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + } //while + } else { + kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type); + +#if BVH_FEATURE(BVH_INSTANCING) + int* nhiptr = &num_hits_in_instance; +#else + int nhi= 0; + int *nhiptr = &nhi; +#endif + + int result = triangle_intersect8(kg, + &isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + prim_count, + num_hits, + max_hits, + nhiptr, + isect_t); + if(result == 2) { + return true; + } + } // prim_count + } // PRIMITIVE_TRIANGLE + else { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + +#ifdef __SHADOW_TRICKS__ + uint tri_object = (object == OBJECT_NONE) + ? kernel_tex_fetch(__prim_object, prim_addr) + : object; + if(tri_object == skip_object) { + ++prim_addr; + continue; + } +#endif + + bool hit; + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch(p_type) { + +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + break; + } +#endif + default: { + hit = false; + break; + } + } + + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? 
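+ * Until then the shader flag is fetched per hit: an opaque hit (no
+ * SD_HAS_TRANSPARENT_SHADOW) blocks all light immediately, otherwise
+ * the hit is recorded and the ray continues until max_hits is reached.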
*/ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + }//while prim + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. 
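+ * The distances recorded inside the instance are in object space;
+ * t_fac is the scale factor returned by the pop, so multiplying each
+ * stored t converts the hits back to world-space distances.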
*/ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return false; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h new file mode 100644 index 00000000000..2021d8e1143 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_traversal.h @@ -0,0 +1,642 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
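+ * (bvh_clamp_direction nudges near-zero direction components away from
+ * zero so that the reciprocal taken by bvh_inverse_direction stays
+ * finite.)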
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + avxf tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(UNLIKELY(node_dist > isect->t) +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0 +#endif + ) + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + avxf dist; + + BVH_DEBUG_NEXT_NODE(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + avxf cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
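+ * (When difl != 0 the _robust node test above scales the slab bounds
+ * by 1 -/+ difl, widening acceptance so thin hair segments close to a
+ * box boundary are not culled early.)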
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. 
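+ * A negative prim_addr marks an instance leaf; otherwise [prim_addr,
+ * prim_addr2) indexes the primitive arrays and leaf.w carries the
+ * primitive type.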
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//for + } + else { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect8(kg, + &isect, + P, + dir, + visibility, + object, + prim_addr, + prim_count, + 0, + 0, + NULL, + 0.0f)) + { + tfar = avxf(isect->t); + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//prim count + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
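+ * Entering the instance transforms the ray into object space; all
+ * SIMD-packed ray constants (near/far indices, tfar, idir4, org4) are
+ * then rebuilt for the transformed ray.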
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h new file mode 100644 index 00000000000..da9ddbd4f24 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_volume.h @@ -0,0 +1,483 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
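+ * Unlike the surface variants, these traversals only intersect
+ * primitives whose object has SD_OBJECT_HAS_VOLUME set.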
+ * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + avxf tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. 
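+ * (three to eight, with the eight-wide nodes used here.)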
We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
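+ * Leaving the instance restores the world-space ray, rescales isect->t
+ * back to world space, and resumes traversal at the node popped from
+ * the stack.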
*/ +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h new file mode 100644 index 00000000000..a88573e6f86 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_volume_all.h @@ -0,0 +1,554 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + const uint visibility) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. 
*/ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + + uint num_hits = 0; + isect_array->t = tmax; + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return 0; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
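obvh_stack_sort keeps traversal near-to-far: after several children are pushed, the top few stack entries are reordered so the entry popped next is the closest. A scalar sketch of the three-item variant (types illustrative; the kernel's stack items likewise carry the distance used for ordering):

#include <cstdio>

struct StackItem { int addr; float dist; };

static void item_swap(StackItem *a, StackItem *b)
{
	StackItem t = *a; *a = *b; *b = t;
}

/* s1 is the top of the stack and must end up with the smallest
 * distance, since the traversal pops the top entry next. */
static void stack_sort3(StackItem *s1, StackItem *s2, StackItem *s3)
{
	if(s2->dist < s1->dist) item_swap(s2, s1);
	if(s3->dist < s2->dist) item_swap(s3, s2);
	if(s2->dist < s1->dist) item_swap(s2, s1);
}

int main(void)
{
	StackItem a = {10, 7.0f}, b = {11, 2.0f}, c = {12, 5.0f};
	stack_sort3(&a, &b, &c);
	printf("%d %d %d\n", a.addr, b.addr, c.addr); /* 11 12 10 */
	return 0;
}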
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. 
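Leaf nodes are encoded as negative node addresses, which is why the leaf path above indexes __bvh_leaf_nodes with (-node_addr-1): inner nodes stay >= 0 and leaf index 0 remains representable. A minimal sketch of the two directions:

#include <cstdio>

static int encode_leaf(int leaf_index) { return -leaf_index - 1; }
static int decode_leaf(int node_addr)  { return -node_addr - 1; }

int main(void)
{
	for(int leaf = 0; leaf < 3; leaf++) {
		int addr = encode_leaf(leaf);
		printf("leaf %d -> addr %d -> leaf %d\n",
		       leaf, addr, decode_leaf(addr));
	}
	return 0;
}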
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; +# if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +# endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +# if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
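When max_hits is reached inside an instance, the distances recorded so far are still in object-space scale; they are mapped back to world space with the single factor t_fac, derived from how much the instance transform stretches the ray direction. A toy example with made-up numbers, assuming a uniform 2x instance scale:

#include <cmath>
#include <cstdio>

int main(void)
{
	/* The world-space direction gains length 2 in object space. */
	const float obj_dir[3] = {0.0f, 0.0f, 2.0f};
	const float len = std::sqrt(obj_dir[0]*obj_dir[0] +
	                            obj_dir[1]*obj_dir[1] +
	                            obj_dir[2]*obj_dir[2]);
	const float t_fac = 1.0f / len;

	const float t_object = 4.0f; /* a hit recorded inside the instance */
	printf("world-space t = %g\n", t_object * t_fac); /* 2 */
	return 0;
}

Because the factor is uniform along the ray, the whole num_hits_in_instance tail of the intersection array can be rescaled in one loop, as the kernel does.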
*/ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. */ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return num_hits; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 3036efd4198..2e622af1758 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, /* Axis-aligned nodes intersection */ -ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index a2b1e050e58..f8c671fed14 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -1,4 +1,4 @@ -/* + /* * Copyright 2014, Blender Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, return false; } -/* Special ray intersection routines for local intersection. In that case we +#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D)) +#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 +ccl_device_inline +#else +ccl_device_forceinline +#endif +int ray_triangle_intersect8(KernelGlobals *kg, + float3 ray_P, + float3 ray_dir, + Intersection **isect, + uint visibility, + int object, + __m256 *triA, + __m256 *triB, + __m256 *triC, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isec_t) +{ + + const unsigned char prim_num_mask = (1 << prim_num) - 1; + + const __m256i zero256 = _mm256_setzero_si256(); + + const __m256 Px256 = _mm256_set1_ps(ray_P.x); + const __m256 Py256 = _mm256_set1_ps(ray_P.y); + const __m256 Pz256 = _mm256_set1_ps(ray_P.z); + + const __m256 dirx256 = _mm256_set1_ps(ray_dir.x); + const __m256 diry256 = _mm256_set1_ps(ray_dir.y); + const __m256 dirz256 = _mm256_set1_ps(ray_dir.z); + + /* Calculate vertices relative to ray origin. 
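The cross256(A,B, C,D) macro defined above expands to a fused A*B - C*D, which is exactly one component of a cross product (e.g. X = Ay*Bz - Az*By) per call. A minimal check, requiring FMA support (e.g. -mavx2 -mfma):

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	const __m256 Ay = _mm256_set1_ps(1.0f), Bz = _mm256_set1_ps(5.0f);
	const __m256 Az = _mm256_set1_ps(2.0f), By = _mm256_set1_ps(3.0f);
	const __m256 X = _mm256_fmsub_ps(Ay, Bz, _mm256_mul_ps(Az, By));

	float out[8];
	_mm256_storeu_ps(out, X);
	printf("%g\n", out[0]); /* 1*5 - 2*3 = -1 */
	return 0;
}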
*/ + /* const float3 v0 = tri_c - P; + const float3 v1 = tri_a - P; + const float3 v2 = tri_b - P; */ + + __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256); + __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256); + __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256); + + __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256); + __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256); + __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256); + + __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256); + __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256); + __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256); + + __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256); + __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256); + __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256); + + __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256); + __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256); + __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256); + + __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256); + __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256); + __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256); + + /* Calculate triangle edges. + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2;*/ + + __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256); + __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256); + __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256); + + __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256); + __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256); + __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256); + + __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256); + __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256); + __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256); + + /* Perform edge tests. + const float U = dot(cross(v2 + v0, e0), ray_dir); + const float V = dot(cross(v0 + v1, e1), ray_dir); + const float W = dot(cross(v1 + v2, e2), ray_dir);*/ + + //cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) + __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256); + __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256); + __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256); + //vertical dot + __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256); + U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256)); + U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256)); + + __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256); + __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256); + __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256); + //vertical dot + __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256); + V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256)); + V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256)); + + __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256); + __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256); + __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256); + //vertical dot + __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256); + W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256)); + W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256)); + + //const float minUVW = min(U, min(V, W)); + //const float maxUVW = 
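The intrinsics above are a straight 8-lane widening of the scalar edge tests quoted in the comments. For reference, the whole test for a single lane, with illustrative helper names:

#include <cmath>
#include <cstdio>

struct F3 { float x, y, z; };

static F3 f3_sub(F3 a, F3 b) { F3 r = {a.x - b.x, a.y - b.y, a.z - b.z}; return r; }
static F3 f3_add(F3 a, F3 b) { F3 r = {a.x + b.x, a.y + b.y, a.z + b.z}; return r; }
static F3 f3_cross(F3 a, F3 b)
{
	F3 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x};
	return r;
}
static float f3_dot(F3 a, F3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

static bool edge_tests_pass(F3 P, F3 dir, F3 tri_a, F3 tri_b, F3 tri_c)
{
	const F3 v0 = f3_sub(tri_c, P), v1 = f3_sub(tri_a, P), v2 = f3_sub(tri_b, P);
	const F3 e0 = f3_sub(v2, v0), e1 = f3_sub(v0, v1), e2 = f3_sub(v1, v2);
	const float U = f3_dot(f3_cross(f3_add(v2, v0), e0), dir);
	const float V = f3_dot(f3_cross(f3_add(v0, v1), e1), dir);
	const float W = f3_dot(f3_cross(f3_add(v1, v2), e2), dir);
	const float minUVW = std::fmin(U, std::fmin(V, W));
	const float maxUVW = std::fmax(U, std::fmax(V, W));
	/* Mixed signs mean the ray passes outside one of the edges. */
	return !(minUVW < 0.0f && maxUVW > 0.0f);
}

int main(void)
{
	F3 P = {0.0f, 0.0f, -1.0f}, dir = {0.0f, 0.0f, 1.0f};
	F3 a = {-1.0f, -1.0f, 0.0f}, b = {1.0f, -1.0f, 0.0f}, c = {0.0f, 1.0f, 0.0f};
	printf("inside: %d\n", edge_tests_pass(P, dir, a, b, c)); /* 1 */
	return 0;
}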
max(U, max(V, W)); +#if 0 + __m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256)); + __m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256)); + + //if(minUVW < 0.0f && maxUVW > 0.0f) + __m256i mask_minmaxUVW_256 = _mm256_and_si256( + _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)), + //_mm256_castps_si256(minUVW_256), + _mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256)); +#else + __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31); + __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31); + __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31); + __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1); + + const __m256i one256 = _mm256_set1_epi32(1); + const __m256i two256 = _mm256_set1_epi32(2); + + __m256i mask_minmaxUVW_256 = _mm256_or_si256( + _mm256_cmpeq_epi32(one256, UVW_256_1), + _mm256_cmpeq_epi32(two256, UVW_256_1) ); +#endif + + unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256)); + if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set + return false; + } + + /* Calculate geometry normal and denominator. */ + // const float3 Ng1 = cross(e1, e0); + //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); + + __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256); + __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256); + __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256); + + //const float3 Ng = Ng1 + Ng1; + Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256); + Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256); + Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256); + + //const float den = dot3(Ng, dir); + //vertical dot + __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256); + den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256)); + den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256)); + + // __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + + /* Perform depth test. 
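The #else path taken above avoids the min/max comparison entirely: it shifts out the sign bits of U, V and W and sums them, so a count of 1 or 2 means mixed signs and the lane is culled, while 0 (all non-negative) and 3 (all negative) pass. Signed zeros can classify slightly differently than the min/max form. A scalar model:

#include <cstdint>
#include <cstring>
#include <cstdio>

static int sign_bit(float f)
{
	uint32_t u;
	std::memcpy(&u, &f, sizeof(u));
	return int(u >> 31);
}

int main(void)
{
	float U = 0.5f, V = 0.25f, W = -0.1f;
	int n = sign_bit(U) + sign_bit(V) + sign_bit(W);
	printf("culled: %d\n", n == 1 || n == 2); /* 1 => mixed signs */
	return 0;
}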
*/ + //const float T = dot3(v0, Ng); + __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256); + T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256)); + T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256)); + + //const int sign_den = (__float_as_int(den) & 0x80000000); + const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000); + __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000); + + //const float sign_T = xor_signmask(T, sign_den); + __m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256)); + + /*if((sign_T < 0.0f) || mask_minmaxUVW_pos { return false;} */ + unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256); + if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) { + return false; + } /**/ + + __m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)); + + + ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8]; + ccl_align(32) unsigned int mask_minmaxUVW8[8]; + + if(visibility == PATH_RAY_SHADOW_OPAQUE){ + __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256 + + __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + + __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256)); + __m256 rayt_256 = _mm256_set1_ps((*isect)->t); + + __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256), + _mm256_castps_si256( + _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256) + ) + ); + /* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256, + _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256), + _CMP_GT_OS + ) );*/ + + mask0 = _mm256_or_si256(mask1, mask0); + //unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0)); + //unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256)); + //unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos)); + mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) + mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden) + + unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256)); + if((mask_final & prim_num_mask) == 0) { //all bits NOT set + return false; + } /**/ + + unsigned long i = 0; +#if defined(_MSC_VER) + unsigned char res = _BitScanForward(&i, (unsigned long)mask_final); +#else + i = __builtin_ffs(mask_final)-1; +#endif + + den_256 = _mm256_rcp_ps(den_256); //inv_den + U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den + V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den + T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den + + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); + + + //here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true + + (*isect)->u = U8[i]; + (*isect)->v = V8[i]; + (*isect)->t = T8[i]; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + return true; + } + else { + _mm256_store_ps(den8, den_256); + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); + + _mm256_store_ps(sign_T8, sign_T_256); + 
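The depth test here is division-free: with the sign of den transferred onto T, the condition 0 <= T/den <= t becomes 0 <= sign_T <= t*|den|, which is what the comparisons against rayt_256 implement. A scalar model of the trick:

#include <cstdint>
#include <cstring>
#include <cstdio>

static float xor_signmask(float f, uint32_t mask)
{
	uint32_t u;
	std::memcpy(&u, &f, sizeof(u));
	u ^= mask;
	std::memcpy(&f, &u, sizeof(f));
	return f;
}

int main(void)
{
	const float den = -2.0f, T = -3.0f, ray_t = 10.0f;

	uint32_t sign_den;
	std::memcpy(&sign_den, &den, sizeof(den));
	sign_den &= 0x80000000u;

	const float sign_T = xor_signmask(T, sign_den);    /* 3.0 */
	const float abs_den = xor_signmask(den, sign_den); /* 2.0 */
	const int hit = (sign_T >= 0.0f) && (sign_T <= ray_t * abs_den);
	printf("hit: %d (t = %g)\n", hit, T / den);        /* hit: 1, t = 1.5 */
	return 0;
}

The actual divisions (the inv_den multiplies) are deferred until a lane is known to hit, so misses never pay for them.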
_mm256_store_ps(xor_signmask8, xor_signmask_256); + _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256); + + int ret = false; + + if(visibility == PATH_RAY_SHADOW) { + for(int i = 0; i < prim_num; i++) { + if(!mask_minmaxUVW8[i]) { +#ifdef __VISIBILITY_FLAG__ + if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) +#endif + { + if((sign_T8[i] >= 0.0f) && + (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) + { + if(den8[i]) { + const float inv_den = 1.0f / den8[i]; + + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + int prim = kernel_tex_fetch(__prim_index, (*isect)->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return 2; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return 2; + } + /* move on to next entry in intersections array */ + ret = true; + + (*isect)++; + (*num_hits)++; + + (*num_hits_in_instance)++; + + (*isect)->t = isec_t; + + } //den + } //if sign + } //vis + }//if mask + } //for + } + else { //default case + for(int i = 0; i < prim_num; i++) { + if(!mask_minmaxUVW8[i]) { +#ifdef __VISIBILITY_FLAG__ + if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) +#endif + { + if((sign_T8[i] >= 0.0f) && + (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) + { + if(den8[i]) { + const float inv_den = 1.0f / den8[i]; + + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + ret = true; + } //den + } //if sign + } //vis + }//if mask + } //for + } //default + return ret; +}// else PATH_RAY_SHADOW_OPAQUE + +} + +//vz static +ccl_device_inline +int triangle_intersect8(KernelGlobals *kg, + Intersection **isect, + float3 P, + float3 dir, + uint visibility, + int object, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isec_t) + { + __m128 tri_a[8], tri_b[8], tri_c[8]; + __m256 tritmp[12], tri[12]; + __m256 triA[3], triB[3], triC[3]; + + int i, r; + + uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); + for(i = 0; i < prim_num; i++) { + tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + } + //create 9 or 12 placeholders + tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256 + tri[1] = _mm256_castps128_ps256(tri_b[0]);//_mm256_zextps128_ps256 + tri[2] = _mm256_castps128_ps256(tri_c[0]);//_mm256_zextps128_ps256 + + tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256 + tri[4] = _mm256_castps128_ps256(tri_b[1]);//_mm256_zextps128_ps256 + tri[5] = _mm256_castps128_ps256(tri_c[1]);//_mm256_zextps128_ps256 + + tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256 + tri[7] = 
_mm256_castps128_ps256(tri_b[2]);//_mm256_zextps128_ps256 + tri[8] = _mm256_castps128_ps256(tri_c[2]);//_mm256_zextps128_ps256 + + if(prim_num > 3) { + tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256 + tri[10] = _mm256_castps128_ps256(tri_b[3]);//_mm256_zextps128_ps256 + tri[11] = _mm256_castps128_ps256(tri_c[3]);//_mm256_zextps128_ps256 + } + + for(i = 4, r = 0; i < prim_num; i ++, r += 3) { + tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1); + tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1); + tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1); + } + + //------------------------------------------------ + //0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1 + //1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1 + //2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1 + + //3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1 + //4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1 + //5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1 + + //6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1 + //7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1 + //8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1 + + //9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1 + //10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1 + //11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1 + + //"transpose" + tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5 + tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1 + + tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7 + tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1 + + tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5 + tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1 + + tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7 + tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1 + + tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5 + tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1 + + tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7 + tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! 
Zc2 Zc3 1 1 Zc6 Zc7 1 1
+
+	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+	triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
+	triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
+	triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
+
+	triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb6 Xb7
+	triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb6 Yb7
+	triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb6 Zb7
+
+	triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
+	triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
+	triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
+
+	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+
+	int result = ray_triangle_intersect8(kg, P,
+	                                     dir,
+	                                     isect,
+	                                     visibility, object,
+	                                     triA,
+	                                     triB,
+	                                     triC,
+	                                     prim_addr,
+	                                     prim_num,
+	                                     num_hits,
+	                                     max_hits,
+	                                     num_hits_in_instance,
+	                                     isec_t);
+	return result;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
 * only want to intersect with primitives in the same object, and if case of
 * multiple hits we pick a single random primitive as the intersection point.
 * Returns whether traversal should be stopped.
@@ -83,7 +555,7 @@ ccl_device_inline bool triangle_intersect_local(
         float3 P,
         float3 dir,
         int object,
-        int local_object,
+        int local_object, 
         int prim_addr,
         float tmax,
         uint *lcg_state,
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 2f955741797..aa7a16afa1d 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
 /* Texture types to be compatible with CUDA textures. These are really just
  * simple arrays and after inlining fetch hopefully revert to being a simple
  * pointer lookup. */
-
 template<typename T> struct texture {
 	ccl_always_inline const T& fetch(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
 		return data[index];
 	}
-
-#ifdef __KERNEL_AVX__
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
 	/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
 	 * compatibility with existing indicies and data structures.
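The unpacklo/hi_ps steps followed by the _pd unpacks above form a 4x8 transpose: twelve AoS registers of (x y z 1) vertices become the SoA triA/triB/triC registers holding eight X, eight Y and eight Z. A runnable miniature of the same two-step pattern for one coordinate, requiring AVX:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	/* Vertices as x,y,z,1 quads, two per register; values chosen so
	 * vertex k has x = 10*k. */
	__m256 v0v4 = _mm256_setr_ps( 0.f,  1.f,  2.f, 1.f, 40.f, 41.f, 42.f, 1.f);
	__m256 v1v5 = _mm256_setr_ps(10.f, 11.f, 12.f, 1.f, 50.f, 51.f, 52.f, 1.f);
	__m256 v2v6 = _mm256_setr_ps(20.f, 21.f, 22.f, 1.f, 60.f, 61.f, 62.f, 1.f);
	__m256 v3v7 = _mm256_setr_ps(30.f, 31.f, 32.f, 1.f, 70.f, 71.f, 72.f, 1.f);

	__m256 lo01 = _mm256_unpacklo_ps(v0v4, v1v5); /* x0 x1 y0 y1 | x4 x5 y4 y5 */
	__m256 lo23 = _mm256_unpacklo_ps(v2v6, v3v7); /* x2 x3 y2 y3 | x6 x7 y6 y7 */
	__m256 x = _mm256_castpd_ps(_mm256_unpacklo_pd(
	        _mm256_castps_pd(lo01), _mm256_castps_pd(lo23)));

	float out[8];
	_mm256_storeu_ps(out, x);
	for(int i = 0; i < 8; i++)
		printf("%g ", out[i]); /* 0 10 20 30 40 50 60 70 */
	printf("\n");
	return 0;
}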
*/ @@ -90,7 +88,6 @@ template<typename T> struct texture { ssef *ssef_node_data = &ssef_data[index]; return _mm256_loadu_ps((float *)ssef_node_data); } - #endif #ifdef __KERNEL_SSE2__ @@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a) print_ssei(label, a.z); } +#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +typedef vector3<avxf> avx3f; +#endif + #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index d169915cff9..e93100a6442 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -1384,8 +1384,9 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_BVH2 = (1 << 0), BVH_LAYOUT_BVH4 = (1 << 1), + BVH_LAYOUT_BVH8 = (1 << 2), - BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4, + BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8, BVH_LAYOUT_ALL = (unsigned int)(-1), } KernelBVHLayout; diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 508f44e7c4d..291f9a9fcae 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -78,6 +78,7 @@ set(SRC_HEADERS util_sky_model.h util_sky_model_data.h util_avxf.h + util_avxb.h util_sseb.h util_ssef.h util_ssei.h @@ -98,7 +99,9 @@ set(SRC_HEADERS util_types_float3_impl.h util_types_float4.h util_types_float4_impl.h - util_types_int2.h + util_types_float8.h + util_types_float8_impl.h + util_types_int2.h util_types_int2_impl.h util_types_int3.h util_types_int3_impl.h diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h new file mode 100644 index 00000000000..865549be283 --- /dev/null +++ b/intern/cycles/util/util_avxb.h @@ -0,0 +1,192 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_AVXB_H__ +#define __UTIL_AVXB_H__ + +CCL_NAMESPACE_BEGIN + +struct avxf; + +/*! 4-wide SSE bool type. 
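kernel_tex_fetch_avxf keeps the existing float4 (ssef) indexing while pulling in two consecutive blocks at once, i.e. a 32-byte (256-bit) load addressed at 16-byte granularity, which is presumably what the bytes/blocks wording in the comment above intends. A sketch of the equivalent addressing:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	float data[12];
	for(int i = 0; i < 12; i++)
		data[i] = float(i);

	int index = 1;                                /* in float4 blocks */
	__m256 v = _mm256_loadu_ps(data + 4 * index); /* floats 4..11 */

	float out[8];
	_mm256_storeu_ps(out, v);
	for(int i = 0; i < 8; i++)
		printf("%g ", out[i]); /* 4 5 6 7 8 9 10 11 */
	printf("\n");
	return 0;
}

This is why the BVH8 node layout can keep the float4-based node offsets (node_addr+14, node_addr+26) used by the traversal code.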
*/ +struct avxb +{ + typedef avxb Mask; // mask type + typedef avxf Float; // float type + + enum { size = 8 }; // number of SIMD elements + union { __m256 m256; int32_t v[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxb ( ) {} + __forceinline avxb ( const avxb& other ) { m256 = other.m256; } + __forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; } + + __forceinline avxb( const __m256 input ) : m256(input) {} + __forceinline operator const __m256&( void ) const { return m256; } + __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); } + __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); } + + //__forceinline avxb ( bool a ) + // : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + //__forceinline avxb ( bool a, bool b) + // : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + //__forceinline avxb ( bool a, bool b, bool c, bool d) + // : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + //__forceinline avxb(int mask) { + // assert(mask >= 0 && mask < 16); + // m128 = _mm_lookupmask_ps[mask]; + //} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {} + __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; } + __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); } + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); } +__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); } +__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); } + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; } +__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; } +__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; } + +//////////////////////////////////////////////////////////////////////////////// +/// 
Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+
+__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
+#if defined(__KERNEL_SSE41__)
+	return _mm256_blendv_ps(f, t, m);
+#else
+	return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
+__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
+
+#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
+                                                         ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxb shuffle( const avxb& a ) {
+	return _mm256_castsi256_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
+}
+
+/*
+template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
+	return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
+	return _mm_movehl_ps(a, a);
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
+	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
+	return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
+	return _mm_movehl_ps(b, a);
+}
+
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
+template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
+#endif
+
+#if defined(__KERNEL_SSE41__)
+template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
+template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
+#endif
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
+                                                      bool(a[5])+bool(a[6])+bool(a[7]); }
+#endif
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
+__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+	printf("%s: %d %d %d %d %d %d %d %d\n",
+	       label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 2451213963a..81365f96c94 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -19,7 +19,8 @@
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_AVX__
+struct avxb;
+
 struct avxf
 {
 	typedef avxf Float;
@@ -53,6 +54,9 @@ struct avxf
 	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
 		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
 
+	__forceinline avxf(float3 a) :
+		m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
+
 	__forceinline avxf(int a3, int a2, int a1, int a0)
 	{
@@ -73,8 +77,24 @@ struct avxf
 		m256 = _mm256_insertf128_ps(foo, b, 1);
 	}
 
+	__forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
+	__forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
 };
 
+__forceinline avxf cross(const avxf& a, const avxf& b)
+{
+	avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
+	       0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
+	return r;
+}
+
+__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
+{
+	const avxf t = _mm256_mul_ps(a.m256, b.m256);
+	den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+	den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Unary Operators
 ////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
 __forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
 
+__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Movement/Shifting/Shuffling Functions
 ////////////////////////////////////////////////////////////////////////////////
@@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
 	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
 }
 
+//#if defined(__KERNEL_SSE41__)
+__forceinline avxf maxi(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_max_ps(a, b);
+	return ci;
+}
+
+__forceinline avxf mini(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_min_ps(a, b);
+	return ci;
+}
+//#endif
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Ternary Operators
 ////////////////////////////////////////////////////////////////////////////////
@@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b,
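Eight float lanes produce an 8-bit _mm256_movemask_ps result, so the all-true pattern the reductions above compare against is 0xff; 0xf, the 4-wide sseb constant, would only cover the low four lanes. A quick check:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	__m256 t = _mm256_cmp_ps(_mm256_set1_ps(1.0f),
	                         _mm256_set1_ps(0.0f), _CMP_GT_OS);
	int mask = _mm256_movemask_ps(t);
	printf("mask = 0x%x, all = %d\n", mask, mask == 0xff); /* 0xff, 1 */
	return 0;
}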
const avxf& c) { return c-(a*b); #endif } +__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) { + return _mm256_fmsub_ps(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators +//////////////////////////////////////////////////////////////////////////////// +#ifdef __KERNEL_AVX2__ +__forceinline const avxb operator <=(const avxf& a, const avxf& b) { + return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); +} +#endif + #endif #ifndef _mm256_set_m128 @@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) { CCL_NAMESPACE_END -#endif diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 9a66a372822..a761c9c46de 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -57,7 +57,19 @@ void DebugFlags::CPU::reset() #undef STRINGIFY #undef CHECK_CPU_FLAGS - bvh_layout = BVH_LAYOUT_DEFAULT; + if (getenv("CYCLES_BVH2") != NULL) { + bvh_layout = BVH_LAYOUT_BVH2; + } + else if (getenv("CYCLES_BVH4") != NULL) { + bvh_layout = BVH_LAYOUT_BVH4; + } + else if (getenv("CYCLES_BVH8") != NULL) { + bvh_layout = BVH_LAYOUT_BVH8; + } + else { + bvh_layout = BVH_LAYOUT_DEFAULT; + } + split_kernel = false; } diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index dfe755a8789..96c549b9be5 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -121,6 +121,7 @@ CCL_NAMESPACE_END #include "util/util_types_float2.h" #include "util/util_types_float3.h" #include "util/util_types_float4.h" +#include "util/util_types_float8.h" #include "util/util_types_vector3.h" @@ -140,6 +141,7 @@ CCL_NAMESPACE_END #include "util/util_types_float2_impl.h" #include "util/util_types_float3_impl.h" #include "util/util_types_float4_impl.h" +#include "util/util_types_float8_impl.h" #include "util/util_types_vector3_impl.h" @@ -148,7 +150,10 @@ CCL_NAMESPACE_END # include "util/util_sseb.h" # include "util/util_ssei.h" # include "util/util_ssef.h" +#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +# include "util/util_avxb.h" # include "util/util_avxf.h" #endif +#endif #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h new file mode 100644 index 00000000000..2e693d49634 --- /dev/null +++ b/intern/cycles/util/util_types_float8.h @@ -0,0 +1,70 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
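msub and nmadd map to single fused instructions on AVX2: msub(a,b,c) = a*b - c via _mm256_fmsub_ps, and nmadd(a,b,c) = c - a*b via _mm256_fnmadd_ps. Verifying both identities on one lane (requires -mfma):

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	__m256 a = _mm256_set1_ps(3.0f);
	__m256 b = _mm256_set1_ps(4.0f);
	__m256 c = _mm256_set1_ps(5.0f);

	float msub[8], nmadd[8];
	_mm256_storeu_ps(msub, _mm256_fmsub_ps(a, b, c));   /* 3*4 - 5 =  7 */
	_mm256_storeu_ps(nmadd, _mm256_fnmadd_ps(a, b, c)); /* 5 - 3*4 = -7 */
	printf("%g %g\n", msub[0], nmadd[0]);
	return 0;
}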
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __UTIL_TYPES_FLOAT8_H__ +#define __UTIL_TYPES_FLOAT8_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ + +struct ccl_try_align(16) float8 { +#ifdef __KERNEL_AVX2__ + union { + __m256 m256; + struct { float a, b, c, d, e, f, g, h; }; + }; + + __forceinline float8(); + __forceinline float8(const float8& a); + __forceinline explicit float8(const __m256& a); + + __forceinline operator const __m256&(void) const; + __forceinline operator __m256&(void); + + __forceinline float8& operator =(const float8& a); + +#else /* __KERNEL_AVX2__ */ + float a, b, c, d, e, f, g, h; +#endif /* __KERNEL_AVX2__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float8 make_float8(float f); +ccl_device_inline float8 make_float8(float a, float b, float c, float d, + float e, float f, float g, float h); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT8_H__ */ diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h new file mode 100644 index 00000000000..4fac03569e9 --- /dev/null +++ b/intern/cycles/util/util_types_float8_impl.h @@ -0,0 +1,113 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__ +#define __UTIL_TYPES_FLOAT8_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_AVX2__
+__forceinline float8::float8()
+{
+}
+
+__forceinline float8::float8(const float8& f)
+	: m256(f.m256)
+{
+}
+
+__forceinline float8::float8(const __m256& f)
+	: m256(f)
+{
+}
+
+__forceinline float8::operator const __m256&(void) const
+{
+	return m256;
+}
+
+__forceinline float8::operator __m256&(void)
+{
+	return m256;
+}
+
+__forceinline float8& float8::operator =(const float8& f)
+{
+	m256 = f.m256;
+	return *this;
+}
+#endif /* __KERNEL_AVX2__ */
+
+__forceinline float float8::operator[](int i) const
+{
+	util_assert(i >= 0);
+	util_assert(i < 8);
+	return *(&a + i);
+}
+
+__forceinline float& float8::operator[](int i)
+{
+	util_assert(i >= 0);
+	util_assert(i < 8);
+	return *(&a + i);
+}
+
+ccl_device_inline float8 make_float8(float f)
+{
+#ifdef __KERNEL_AVX2__
+	float8 r(_mm256_set1_ps(f));
+#else
+	float8 r = {f, f, f, f, f, f, f, f};
+#endif
+	return r;
+}
+
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+                                     float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX2__
+	float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+	float8 r = {a, b, c, d, e, f, g, h};
+#endif
+	return r;
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
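One portability note worth a check: _mm256_set_ps takes its arguments high lane first, so keeping lane 0 equal to the first field of the scalar {a, ..., h} fallback needs either reversed arguments or _mm256_setr_ps as used above. Demonstration:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	/* set_ps fills from the high lane down... */
	__m256 v = _mm256_set_ps(8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f);
	/* ...so memory order matches setr_ps(1, 2, ..., 8). */
	float out[8];
	_mm256_storeu_ps(out, v);
	printf("lane0 = %g, lane7 = %g\n", out[0], out[7]); /* 1, 8 */
	return 0;
}

Keeping the AVX2 and scalar paths lane-for-lane identical matters because operator[] indexes the union fields directly, regardless of which path built the value.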