git.blender.org/blender.git
author    Sergey Sharybin <sergey.vfx@gmail.com> 2018-02-14 13:23:30 +0300
committer Sergey Sharybin <sergey.vfx@gmail.com> 2018-03-13 16:29:48 +0300
commit    fa6a1d06d1aecad33659393fc49ed8506213db28 (patch)
tree      9cba5300d8fbce2fe200243f120b20abcdcc0864
parent    81c199af8365c5a52bb7e8a1a4046acfceea1519 (diff)
Cycles: Add BVH8 and packed triangle intersection (branch: cycles_bvh8)
This is an initial implementation of the BVH8 optimization structure and packed triangle intersection. The aim is to get faster ray-to-scene intersection checks.

Render times per scene (min:sec.ms):

Scene                 BVH4      BVH8
barbershop_interior   10:24.94  10:10.74
bmw27                 02:41.25  02:38.83
classroom             08:16.49  07:56.15
fishy_cat             04:24.56  04:17.29
koro                  06:03.06  06:01.45
pavillon_barcelona    09:21.26  09:02.98
victor                23:39.65  22:53.71

Original BVH8 patch from Anton Gavrikov. Batched triangle intersection from Victoria Zhislina. Extra work, tests and fixes from Maxym Dmytrychenko.
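The core of the speedup is testing a ray against all eight child bounding boxes of a node with one set of AVX operations. A minimal sketch of that 8-wide slab test, as an illustration only: it uses raw intrinsics and hypothetical names, whereas the patch itself goes through the avxf wrappers added in util_avxf.h and folds the origin term in via msub with a precomputed P*idir.

#include <immintrin.h>

struct OBVHNodeBounds {
	/* Bounds in structure-of-arrays form: one 8-lane vector per box side. */
	__m256 lo_x, hi_x, lo_y, hi_y, lo_z, hi_z;
};

/* Returns an 8-bit mask of children whose slab interval overlaps [t_min, t_max].
 * idir_* is the precomputed reciprocal ray direction. */
static inline int intersect8(const OBVHNodeBounds &n,
                             float org_x, float org_y, float org_z,
                             float idir_x, float idir_y, float idir_z,
                             float t_min, float t_max)
{
	const __m256 ox = _mm256_set1_ps(org_x), ix = _mm256_set1_ps(idir_x);
	const __m256 oy = _mm256_set1_ps(org_y), iy = _mm256_set1_ps(idir_y);
	const __m256 oz = _mm256_set1_ps(org_z), iz = _mm256_set1_ps(idir_z);

	/* Distances to both planes of each slab, for all 8 boxes at once. */
	const __m256 tx0 = _mm256_mul_ps(_mm256_sub_ps(n.lo_x, ox), ix);
	const __m256 tx1 = _mm256_mul_ps(_mm256_sub_ps(n.hi_x, ox), ix);
	const __m256 ty0 = _mm256_mul_ps(_mm256_sub_ps(n.lo_y, oy), iy);
	const __m256 ty1 = _mm256_mul_ps(_mm256_sub_ps(n.hi_y, oy), iy);
	const __m256 tz0 = _mm256_mul_ps(_mm256_sub_ps(n.lo_z, oz), iz);
	const __m256 tz1 = _mm256_mul_ps(_mm256_sub_ps(n.hi_z, oz), iz);

	const __m256 tnear = _mm256_max_ps(
	        _mm256_max_ps(_mm256_min_ps(tx0, tx1), _mm256_min_ps(ty0, ty1)),
	        _mm256_max_ps(_mm256_min_ps(tz0, tz1), _mm256_set1_ps(t_min)));
	const __m256 tfar = _mm256_min_ps(
	        _mm256_min_ps(_mm256_max_ps(tx0, tx1), _mm256_max_ps(ty0, ty1)),
	        _mm256_min_ps(_mm256_max_ps(tz0, tz1), _mm256_set1_ps(t_max)));

	/* One sign bit per child: set where the ray enters before it exits. */
	return _mm256_movemask_ps(_mm256_cmp_ps(tnear, tfar, _CMP_LE_OQ));
}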
-rw-r--r--  intern/cycles/blender/addon/properties.py          |   1
-rw-r--r--  intern/cycles/bvh/CMakeLists.txt                    |   2
-rw-r--r--  intern/cycles/bvh/bvh.cpp                           |  91
-rw-r--r--  intern/cycles/bvh/bvh.h                             |   8
-rw-r--r--  intern/cycles/bvh/bvh2.cpp                          |  11
-rw-r--r--  intern/cycles/bvh/bvh4.cpp                          |  29
-rw-r--r--  intern/cycles/bvh/bvh8.cpp                          | 515
-rw-r--r--  intern/cycles/bvh/bvh8.h                            |  97
-rw-r--r--  intern/cycles/bvh/bvh_node.cpp                      |  49
-rw-r--r--  intern/cycles/bvh/bvh_node.h                        |   2
-rw-r--r--  intern/cycles/device/device_cpu.cpp                 |   3
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt                 |   8
-rw-r--r--  intern/cycles/kernel/bvh/bvh.h                      |   3
-rw-r--r--  intern/cycles/kernel/bvh/bvh_local.h                |  12
-rw-r--r--  intern/cycles/kernel/bvh/bvh_shadow_all.h           |  12
-rw-r--r--  intern/cycles/kernel/bvh/bvh_traversal.h            |  16
-rw-r--r--  intern/cycles/kernel/bvh/bvh_types.h                |   2
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume.h               |  10
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume_all.h           |  11
-rw-r--r--  intern/cycles/kernel/bvh/obvh_local.h               | 402
-rw-r--r--  intern/cycles/kernel/bvh/obvh_nodes.h               | 532
-rw-r--r--  intern/cycles/kernel/bvh/obvh_shadow_all.h          | 687
-rw-r--r--  intern/cycles/kernel/bvh/obvh_traversal.h           | 642
-rw-r--r--  intern/cycles/kernel/bvh/obvh_volume.h              | 483
-rw-r--r--  intern/cycles/kernel/bvh/obvh_volume_all.h          | 554
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_nodes.h               |   3
-rw-r--r--  intern/cycles/kernel/geom/geom_triangle_intersect.h | 478
-rw-r--r--  intern/cycles/kernel/kernel_compat_cpu.h            |   9
-rw-r--r--  intern/cycles/kernel/kernel_types.h                 |   1
-rw-r--r--  intern/cycles/util/CMakeLists.txt                   |   5
-rw-r--r--  intern/cycles/util/util_avxb.h                      | 191
-rw-r--r--  intern/cycles/util/util_avxf.h                      |  51
-rw-r--r--  intern/cycles/util/util_debug.cpp                   |  14
-rw-r--r--  intern/cycles/util/util_types.h                     |   5
-rw-r--r--  intern/cycles/util/util_types_float8.h              |  70
-rw-r--r--  intern/cycles/util/util_types_float8_impl.h         | 113
36 files changed, 5060 insertions, 62 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 8dbd80f3747..a6995d9d750 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -50,6 +50,7 @@ enum_displacement_methods = (
enum_bvh_layouts = (
('BVH2', "BVH2", "", 1),
('BVH4', "BVH4", "", 2),
+ ('BVH8', "BVH8", "", 4),
)
enum_bvh_types = (
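The enum values above (1, 2, 4) are powers of two because the layout identifiers double as bitmask flags: the device_cpu.cpp hunk below ORs BVH_LAYOUT_BVH8 into bvh_layout_mask only when AVX2 is available. A small sketch of that convention; the BVH_LAYOUT_* values follow the same power-of-two pattern as kernel_types.h, and the widest_supported_layout() helper is hypothetical.

enum BVHLayout {
	BVH_LAYOUT_NONE = 0,
	BVH_LAYOUT_BVH2 = (1 << 0),
	BVH_LAYOUT_BVH4 = (1 << 1),
	BVH_LAYOUT_BVH8 = (1 << 2),
};

/* Pick the widest layout a device advertises in its mask. */
static inline BVHLayout widest_supported_layout(int bvh_layout_mask)
{
	if(bvh_layout_mask & BVH_LAYOUT_BVH8) return BVH_LAYOUT_BVH8;
	if(bvh_layout_mask & BVH_LAYOUT_BVH4) return BVH_LAYOUT_BVH4;
	return BVH_LAYOUT_BVH2;
}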
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index b8171e7f70d..fcd28572fdf 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -10,6 +10,7 @@ set(SRC
bvh.cpp
bvh2.cpp
bvh4.cpp
+ bvh8.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_node.cpp
@@ -22,6 +23,7 @@ set(SRC_HEADERS
bvh.h
bvh2.h
bvh4.h
+ bvh8.h
bvh_binning.h
bvh_build.h
bvh_node.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index b524ca07d8d..bc73a3ad264 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -22,6 +22,7 @@
#include "bvh/bvh2.h"
#include "bvh/bvh4.h"
+#include "bvh/bvh8.h"
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"
@@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout)
switch(layout) {
case BVH_LAYOUT_BVH2: return "BVH2";
case BVH_LAYOUT_BVH4: return "BVH4";
+ case BVH_LAYOUT_BVH8: return "BVH8";
case BVH_LAYOUT_NONE: return "NONE";
case BVH_LAYOUT_ALL: return "ALL";
}
@@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
return new BVH2(params, objects);
case BVH_LAYOUT_BVH4:
return new BVH4(params, objects);
+ case BVH_LAYOUT_BVH8:
+ return new BVH8(params, objects);
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
@@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
}
}
visibility |= ob->visibility_for_tracing();
+
+ }
+}
+
+bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh)
+{
+ if(node->is_leaf()) {
+ return node->is_unaligned;
+ }
+ else {
+ return node_is_unaligned(node, bvh);
+ }
+}
+
+bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh)
+{
+ const BVHNode *node0 = node->get_child(0);
+ const BVHNode *node1 = node->get_child(1);
+
+ switch(bvh) {
+ case bvh2:
+ return node0->is_unaligned || node1->is_unaligned;
+ break;
+ case bvh4:
+ return leaf_check(node0, bvh2) || leaf_check(node1, bvh2);
+ break;
+ case bvh8:
+ return leaf_check(node0, bvh4) || leaf_check(node1, bvh4);
+ break;
+ default:
+ assert(0);
+ return false;
}
}
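A 2^k-wide node is formed by collapsing k levels of the underlying binary tree (k = 2 for BVH4, k = 3 for BVH8), which is why node_is_unaligned() for bvh8 above recurses through leaf_check() at bvh4 depth and on down to bvh2. A sketch of the equivalent child gathering, with a hypothetical helper; BVH8::pack_nodes() in bvh8.cpp below does the same thing with explicit branches.

static void gather_children(const BVHNode *node, int depth,
                            const BVHNode **out, int& num)
{
	/* Stop at leaves, or once the target depth is reached. */
	if(node->is_leaf() || depth == 0) {
		out[num++] = node;
		return;
	}
	gather_children(node->get_child(0), depth - 1, out, num);
	gather_children(node->get_child(1), depth - 1, out, num);
}

/* Usage: up to 2^3 = 8 children for one BVH8 node.
 * const BVHNode *children[8]; int num = 0;
 * gather_children(inner, 3, children, num); */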
@@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
* BVH's are stored in global arrays. This function merges them into the
* top level BVH, adjusting indexes and offsets where appropriate.
*/
- /* TODO(sergey): This code needs adjustment for wider BVH than 4. */
const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
+ const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
/* Adjust primitive index to point to the triangle in the global array, for
* meshes with transform applied and already in the top level BVH.
@@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
size_t nsize, nsize_bbox;
if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
- nsize = use_qbvh
- ? BVH_UNALIGNED_QNODE_SIZE
- : BVH_UNALIGNED_NODE_SIZE;
- nsize_bbox = (use_qbvh)? 13: 0;
+ if(use_obvh) {
+ nsize = BVH_UNALIGNED_ONODE_SIZE;
+ nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1;
+ }
+ else {
+ nsize = use_qbvh
+ ? BVH_UNALIGNED_QNODE_SIZE
+ : BVH_UNALIGNED_NODE_SIZE;
+ nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0;
+ }
}
else {
- nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
- nsize_bbox = (use_qbvh)? 7: 0;
+ if(use_obvh) {
+ nsize = BVH_ONODE_SIZE;
+ nsize_bbox = BVH_ONODE_SIZE-1;
+ }
+ else {
+ nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
+ nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0;
+ }
}
memcpy(pack_nodes + pack_nodes_offset,
@@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
/* Modify offsets into arrays */
int4 data = bvh_nodes[i + nsize_bbox];
-
- data.z += (data.z < 0)? -noffset_leaf: noffset;
- data.w += (data.w < 0)? -noffset_leaf: noffset;
-
- if(use_qbvh) {
- data.x += (data.x < 0)? -noffset_leaf: noffset;
- data.y += (data.y < 0)? -noffset_leaf: noffset;
+ int4 data1 = bvh_nodes[i + nsize_bbox-1];
+ if(use_obvh) {
+ data.z += (data.z < 0) ? -noffset_leaf : noffset;
+ data.w += (data.w < 0) ? -noffset_leaf : noffset;
+ data.x += (data.x < 0) ? -noffset_leaf : noffset;
+ data.y += (data.y < 0) ? -noffset_leaf : noffset;
+ data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
+ data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
+ data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
+ data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
+ }
+ else {
+ data.z += (data.z < 0) ? -noffset_leaf : noffset;
+ data.w += (data.w < 0) ? -noffset_leaf : noffset;
+ if(use_qbvh) {
+ data.x += (data.x < 0)? -noffset_leaf: noffset;
+ data.y += (data.y < 0)? -noffset_leaf: noffset;
+ }
}
-
pack_nodes[pack_nodes_offset + nsize_bbox] = data;
+ if(use_obvh) {
+ pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
+ }
/* Usually this copies nothing, but we better
* be prepared for possible node size extension.
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 6a82f915692..86be0bae4be 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -73,6 +73,12 @@ struct PackedBVH {
}
};
+enum BVH_TYPE {
+ bvh2,
+ bvh4,
+ bvh8
+};
+
/* BVH */
class BVH
@@ -93,6 +99,8 @@ protected:
/* Refit range of primitives. */
void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
+ static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh);
+ static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh);
/* triangles and strands */
void pack_primitives();
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
index 9d89d2b6afb..4a423c16559 100644
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -25,13 +25,6 @@
CCL_NAMESPACE_BEGIN
-static bool node_bvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- return node0->is_unaligned || node1->is_unaligned;
-}
-
BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
{
@@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_bvh_is_unaligned(root)
+ nextNodeIdx += node_is_unaligned(root, bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
@@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
idx[i] = nextNodeIdx;
- nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
+ nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
index 4faf47af7bb..a449587e607 100644
--- a/intern/cycles/bvh/bvh4.cpp
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN
* Perhaps we can merge nodes in actual tree and make our
* life easier all over the place.
*/
-static bool node_qbvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- bool has_unaligned = false;
- if(node0->is_leaf()) {
- has_unaligned |= node0->is_unaligned;
- }
- else {
- has_unaligned |= node0->get_child(0)->is_unaligned;
- has_unaligned |= node0->get_child(1)->is_unaligned;
- }
- if(node1->is_leaf()) {
- has_unaligned |= node1->is_unaligned;
- }
- else {
- has_unaligned |= node1->get_child(0)->is_unaligned;
- has_unaligned |= node1->get_child(1)->is_unaligned;
- }
- return has_unaligned;
-}
BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
@@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_qbvh_is_unaligned(root)
+ nextNodeIdx += node_is_unaligned(root, bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
idx = nextNodeIdx;
- nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
+ nextNodeIdx += node_is_unaligned(nodes[i], bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
- 4);
+ num_nodes);
}
else {
pack_aligned_node(idx,
@@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
- 4);
+ num_nodes);
}
}
}
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
new file mode 100644
index 00000000000..e8b6726602c
--- /dev/null
+++ b/intern/cycles/bvh/bvh8.cpp
@@ -0,0 +1,515 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "bvh/bvh8.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+}
+
+void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
+{
+ float4 data[BVH_ONODE_LEAF_SIZE];
+ memset(data, 0, sizeof(data));
+ if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+ /* object */
+ data[0].x = __int_as_float(~(leaf->lo));
+ data[0].y = __int_as_float(0);
+ }
+ else {
+ /* triangle */
+ data[0].x = __int_as_float(leaf->lo);
+ data[0].y = __int_as_float(leaf->hi);
+ }
+ data[0].z = __uint_as_float(leaf->visibility);
+ if(leaf->num_triangles() != 0) {
+ data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+ }
+
+ memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+}
+
+void BVH8::pack_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ bool has_unaligned = false;
+ /* Check whether we have to create an unaligned node, or all children
+ * are aligned and we can cut some corners here.
+ */
+ if(params.use_unaligned_nodes) {
+ for(int i = 0; i < num; i++) {
+ if(en[i].node->is_unaligned) {
+ has_unaligned = true;
+ break;
+ }
+ }
+ }
+ if(has_unaligned) {
+ /* Create an unaligned node with an orientation transform for each of
+ * the children.
+ */
+ pack_unaligned_inner(e, en, num);
+ }
+ else {
+ /* All children are aligned, pack into a plain AABB node. */
+ pack_aligned_inner(e, en, num);
+ }
+}
+
+void BVH8::pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ BoundBox bounds[8];
+ int child[8];
+ for(int i = 0; i < num; ++i) {
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_aligned_node(e.idx,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH8::pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float8 data[8];
+ memset(data, 0, sizeof(data));
+
+ data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+ data[0].b = time_from;
+ data[0].c = time_to;
+ for(int i = 0; i < num; i++) {
+ float3 bb_min = bounds[i].min;
+ float3 bb_max = bounds[i].max;
+
+ data[1][i] = bb_min.x;
+ data[2][i] = bb_max.x;
+ data[3][i] = bb_min.y;
+ data[4][i] = bb_max.y;
+ data[5][i] = bb_min.z;
+ data[6][i] = bb_max.z;
+
+ data[7][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 8; i++) {
+ /* We store a BB which would never be recorded as an intersection,
+ * so the kernel can safely assume there are always 8 child nodes.
+ */
+ data[1][i] = FLT_MAX;
+ data[2][i] = -FLT_MAX;
+
+ data[3][i] = FLT_MAX;
+ data[4][i] = -FLT_MAX;
+
+ data[5][i] = FLT_MAX;
+ data[6][i] = -FLT_MAX;
+
+ data[7][i] = __int_as_float(0);
+ }
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
+}
+
+void BVH8::pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ Transform aligned_space[8];
+ BoundBox bounds[8];
+ int child[8];
+ for(int i = 0; i < num; ++i) {
+ aligned_space[i] = en[i].node->get_aligned_space();
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_unaligned_node(e.idx,
+ aligned_space,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH8::pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float8 data[BVH_UNALIGNED_ONODE_SIZE];
+ memset(data, 0, sizeof(data));
+ data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+ data[0].b = time_from;
+ data[0].c = time_to;
+
+ for(int i = 0; i < num; i++) {
+ Transform space = BVHUnaligned::compute_node_transform(
+ bounds[i],
+ aligned_space[i]);
+
+ data[1][i] = space.x.x;
+ data[2][i] = space.x.y;
+ data[3][i] = space.x.z;
+
+ data[4][i] = space.y.x;
+ data[5][i] = space.y.y;
+ data[6][i] = space.y.z;
+
+ data[7][i] = space.z.x;
+ data[8][i] = space.z.y;
+ data[9][i] = space.z.z;
+
+ data[10][i] = space.x.w;
+ data[11][i] = space.y.w;
+ data[12][i] = space.z.w;
+
+ data[13][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 8; i++) {
+ /* We store a BB which would never be recorded as an intersection,
+ * so the kernel can safely assume there are always 8 child nodes.
+ */
+
+ data[1][i] = 1.0f;
+ data[2][i] = 0.0f;
+ data[3][i] = 0.0f;
+
+ data[4][i] = 0.0f;
+ data[5][i] = 0.0f;
+ data[6][i] = 0.0f;
+
+ data[7][i] = 0.0f;
+ data[8][i] = 0.0f;
+ data[9][i] = 0.0f;
+
+ data[10][i] = -FLT_MAX;
+ data[11][i] = -FLT_MAX;
+ data[12][i] = -FLT_MAX;
+
+ data[13][i] = __int_as_float(0);
+ }
+
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE);
+}
+
+/* Octo SIMD Nodes */
+
+void BVH8::pack_nodes(const BVHNode *root)
+{
+ /* Calculate size of the arrays required. */
+ const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT);
+ const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+ assert(num_leaf_nodes <= num_nodes);
+ const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+ size_t node_size;
+ if(params.use_unaligned_nodes) {
+ const size_t num_unaligned_nodes =
+ root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT);
+ node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
+ (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
+ }
+ else {
+ node_size = num_inner_nodes * BVH_ONODE_SIZE;
+ }
+ /* Resize arrays. */
+ pack.nodes.clear();
+ pack.leaf_nodes.clear();
+ /* For top level BVH, first merge existing BVH's so we know the offsets. */
+ if(params.top_level) {
+ pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+ }
+ else {
+ pack.nodes.resize(node_size);
+ pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+ }
+
+ int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+ vector<BVHStackEntry> stack;
+ stack.reserve(BVHParams::MAX_DEPTH*2);
+ if(root->is_leaf()) {
+ stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+ }
+ else {
+ stack.push_back(BVHStackEntry(root, nextNodeIdx));
+ nextNodeIdx += node_is_unaligned(root, bvh8)
+ ? BVH_UNALIGNED_ONODE_SIZE
+ : BVH_ONODE_SIZE;
+ }
+
+ while(stack.size()) {
+ BVHStackEntry e = stack.back();
+ stack.pop_back();
+
+ if(e.node->is_leaf()) {
+ /* leaf node */
+ const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+ pack_leaf(e, leaf);
+ }
+ else {
+ /* Inner node. */
+ const BVHNode *node = e.node;
+ const BVHNode *node0 = node->get_child(0);
+ const BVHNode *node1 = node->get_child(1);
+ /* Collect nodes. */
+ const BVHNode *nodes[8];
+ int numnodes = 0;
+ if(node0->is_leaf()) {
+ nodes[numnodes++] = node0;
+ }
+ else {
+ const BVHNode *node00 = node0->get_child(0),
+ *node01 = node0->get_child(1);
+ if(node00->is_leaf()) {
+ nodes[numnodes++] = node00;
+ }
+ else {
+ nodes[numnodes++] = node00->get_child(0);
+ nodes[numnodes++] = node00->get_child(1);
+ }
+ if(node01->is_leaf()) {
+ nodes[numnodes++] = node01;
+ }
+ else {
+ nodes[numnodes++] = node01->get_child(0);
+ nodes[numnodes++] = node01->get_child(1);
+ }
+ }
+ if(node1->is_leaf()) {
+ nodes[numnodes++] = node1;
+ }
+ else {
+ const BVHNode *node10 = node1->get_child(0),
+ *node11 = node1->get_child(1);
+ if(node10->is_leaf()) {
+ nodes[numnodes++] = node10;
+ }
+ else {
+ nodes[numnodes++] = node10->get_child(0);
+ nodes[numnodes++] = node10->get_child(1);
+ }
+ if(node11->is_leaf()) {
+ nodes[numnodes++] = node11;
+ }
+ else {
+ nodes[numnodes++] = node11->get_child(0);
+ nodes[numnodes++] = node11->get_child(1);
+ }
+ }
+ /* Push entries on the stack. */
+ for(int i = 0; i < numnodes; ++i) {
+ int idx;
+ if(nodes[i]->is_leaf()) {
+ idx = nextLeafNodeIdx++;
+ }
+ else {
+ idx = nextNodeIdx;
+ nextNodeIdx += node_is_unaligned(nodes[i], bvh8)
+ ? BVH_UNALIGNED_ONODE_SIZE
+ : BVH_ONODE_SIZE;
+ }
+ stack.push_back(BVHStackEntry(nodes[i], idx));
+ }
+ /* Set node. */
+ pack_inner(e, &stack[stack.size() - numnodes], numnodes);
+ }
+ }
+ assert(node_size == nextNodeIdx);
+ /* Root index to start traversal at, to handle case of single leaf node. */
+ pack.root_index = (root->is_leaf()) ? -1 : 0;
+}
+
+void BVH8::refit_nodes()
+{
+ assert(!params.top_level);
+
+ BoundBox bbox = BoundBox::empty;
+ uint visibility = 0;
+ refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+ if(leaf) {
+ int4 *data = &pack.leaf_nodes[idx];
+ int4 c = data[0];
+ /* Refit leaf node. */
+ for(int prim = c.x; prim < c.y; prim++) {
+ int pidx = pack.prim_index[prim];
+ int tob = pack.prim_object[prim];
+ Object *ob = objects[tob];
+
+ if(pidx == -1) {
+ /* Object instance. */
+ bbox.grow(ob->bounds);
+ }
+ else {
+ /* Primitives. */
+ const Mesh *mesh = ob->mesh;
+
+ if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+ /* Curves. */
+ int str_offset = (params.top_level) ? mesh->curve_offset : 0;
+ Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+ int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+ curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+ visibility |= PATH_RAY_CURVE;
+
+ /* Motion curves. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->curve_keys.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *key_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++) {
+ curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+ }
+ }
+ }
+ }
+ else {
+ /* Triangles. */
+ int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
+ Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+ const float3 *vpos = &mesh->verts[0];
+
+ triangle.bounds_grow(vpos, bbox);
+
+ /* Motion triangles. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->verts.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *vert_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++) {
+ triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+ }
+ }
+ }
+ }
+ }
+
+ visibility |= ob->visibility;
+ }
+
+ float4 leaf_data[BVH_ONODE_LEAF_SIZE];
+ leaf_data[0].x = __int_as_float(c.x);
+ leaf_data[0].y = __int_as_float(c.y);
+ leaf_data[0].z = __uint_as_float(visibility);
+ leaf_data[0].w = __uint_as_float(c.w);
+ memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+ }
+ else {
+ int4 *data = &pack.nodes[idx];
+ bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+ int4 c;
+ if(is_unaligned) {
+ c = data[BVH_UNALIGNED_ONODE_SIZE-1];
+ }
+ else {
+ c = data[BVH_ONODE_SIZE-1];
+ }
+ /* Refit inner node, set bbox from children. */
+ BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty };
+ uint child_visibility[8] = { 0 };
+ int num_nodes = 0;
+
+ for(int i = 0; i < 8; ++i) {
+ if(c[i] != 0) {
+ refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+ child_bbox[i], child_visibility[i]);
+ ++num_nodes;
+ bbox.grow(child_bbox[i]);
+ visibility |= child_visibility[i];
+ }
+ }
+
+ if(is_unaligned) {
+ Transform aligned_space[8] = { transform_identity(), transform_identity(),
+ transform_identity(), transform_identity(),
+ transform_identity(), transform_identity(),
+ transform_identity(), transform_identity()};
+ pack_unaligned_node(idx,
+ aligned_space,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ num_nodes);
+ }
+ else {
+ pack_aligned_node(idx,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ num_nodes);
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
new file mode 100644
index 00000000000..99ff79e60b5
--- /dev/null
+++ b/intern/cycles/bvh/bvh8.h
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __BVH8_H__
+#define __BVH8_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_ONODE_SIZE 16
+#define BVH_ONODE_LEAF_SIZE 1
+#define BVH_UNALIGNED_ONODE_SIZE 28
+
+/* BVH8
+ *
+ * Octo BVH, with each node having eight children, to use with SIMD instructions.
+ */
+class BVH8 : public BVH {
+protected:
+ /* constructor */
+ friend class BVH;
+ BVH8(const BVHParams& params, const vector<Object*>& objects);
+
+ /* pack */
+ void pack_nodes(const BVHNode *root);
+
+ void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
+ void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
+
+ void pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ void pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ /* refit */
+ void refit_nodes();
+ void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH8_H__ */
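A note on the size constants defined in this header: pack.nodes is a float4 array, so node sizes are counted in float4 units. An aligned BVH8 node occupies 8 float8 rows (one header row, six bounds rows, one child-index row), i.e. 16 float4; an unaligned node occupies 14 float8 rows (header, twelve transform rows, child indices), i.e. 28 float4. A quick sanity-check sketch, assuming C++11 and the float8 type this patch adds in util_types_float8.h:

static_assert(8 * sizeof(float8) == BVH_ONODE_SIZE * sizeof(float4),
              "aligned OBVH node: 8 float8 rows == 16 float4");
static_assert(14 * sizeof(float8) == BVH_UNALIGNED_ONODE_SIZE * sizeof(float4),
              "unaligned OBVH node: 14 float8 rows == 28 float4");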
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 24af919ff46..8749df88689 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
}
}
return cnt;
+ case BVH_STAT_ONODE_COUNT:
+ cnt = 1;
+ for(int i = 0; i < num_children(); i++) {
+ BVHNode *node = get_child(i);
+ if(node->is_leaf()) {
+ cnt += 1;
+ }
+ else {
+ for(int j = 0; j < node->num_children(); j++) {
+ BVHNode *node_next = node->get_child(j);
+ if(node_next->is_leaf()) {
+ cnt += 1;
+ }
+ else {
+ for(int k = 0; k < node_next->num_children(); k++) {
+ cnt += node_next->get_child(k)->getSubtreeSize(stat);
+ }
+ }
+ }
+ }
+ }
+ return cnt;
+ case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT:
+ {
+ bool has_unaligned = false;
+ for(int i = 0; i < num_children(); i++) {
+ BVHNode *node = get_child(i);
+ if(node->is_leaf()) {
+ has_unaligned |= node->is_unaligned;
+ }
+ else {
+ for(int j = 0; j < node->num_children(); j++) {
+ BVHNode *node_next = node->get_child(j);
+ if(node_next->is_leaf()) {
+ has_unaligned |= node_next->is_unaligned;
+ }
+ else {
+ for(int k = 0; k < node_next->num_children(); k++) {
+ cnt += node_next->get_child(k)->getSubtreeSize(stat);
+ has_unaligned |= node_next->get_child(k)->is_unaligned;
+ }
+ }
+ }
+ }
+ }
+ cnt += has_unaligned? 1: 0;
+ }
+ return cnt;
case BVH_STAT_ALIGNED_COUNT:
if(!is_unaligned) {
cnt = 1;
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 94cf5ab730c..ed89d52a50a 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -39,6 +39,8 @@ enum BVH_STAT {
BVH_STAT_ALIGNED_LEAF_COUNT,
BVH_STAT_UNALIGNED_LEAF_COUNT,
BVH_STAT_DEPTH,
+ BVH_STAT_ONODE_COUNT,
+ BVH_STAT_UNALIGNED_INNER_ONODE_COUNT,
};
class BVHParams;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6be60f8bbb6..cc37ba33252 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -1050,6 +1050,9 @@ void device_cpu_info(vector<DeviceInfo>& devices)
if (system_cpu_support_sse2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
}
+ if (system_cpu_support_avx2()) {
+ info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
+ }
info.has_volume_decoupled = true;
info.has_osl = true;
info.has_half_images = true;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9b7f4e00084..adbfcabc86c 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS
bvh/qbvh_traversal.h
bvh/qbvh_volume.h
bvh/qbvh_volume_all.h
+ bvh/obvh_local.h
+ bvh/obvh_nodes.h
+ bvh/obvh_shadow_all.h
+ bvh/obvh_traversal.h
+ bvh/obvh_volume.h
+ bvh/obvh_volume_all.h
)
set(SRC_HEADERS
@@ -266,6 +272,8 @@ set(SRC_UTIL_HEADERS
../util/util_types_float3_impl.h
../util/util_types_float4.h
../util/util_types_float4_impl.h
+ ../util/util_types_float8.h
+ ../util/util_types_float8_impl.h
../util/util_types_int2.h
../util/util_types_int2_impl.h
../util/util_types_int3.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index d3e0b25a200..81100ecd44d 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN
/* Common QBVH functions. */
#ifdef __QBVH__
# include "kernel/bvh/qbvh_nodes.h"
+# ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_nodes.h"
+# endif
#endif
/* Regular BVH traversal */
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 9292cc76a5c..5a150fa52d5 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_local.h"
+# ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_local.h"
+# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -244,6 +247,15 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
int max_hits)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ local_isect,
+ local_object,
+ lcg_state,
+ max_hits);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index cfc567ff9ca..d525b29fd94 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_shadow_all.h"
+# ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_shadow_all.h"
+# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
uint *num_hits)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect_array,
+ visibility,
+ max_hits,
+ num_hits);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 551625eae78..e95d2408201 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -20,6 +20,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_traversal.h"
#endif
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_traversal.h"
+#endif
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
@@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect,
+ visibility
+# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , lcg_state,
+ difl,
+ extmax
+# endif
+ );
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index ead424aaaaf..4ca0dc2225e 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
#define BVH_STACK_SIZE 192
#define BVH_QSTACK_SIZE 384
-
+#define BVH_OSTACK_SIZE 768
/* BVH intersection function variations */
#define BVH_INSTANCING 1
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index ce5fc7be33d..7d03855cb8f 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume.h"
+# ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_volume.h"
+# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect,
+ visibility);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index f2379efc656..8df3122c044 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume_all.h"
+# ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_volume_all.h"
+# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -382,6 +385,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ visibility);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
new file mode 100644
index 00000000000..b61340ac7a8
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_local.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device void BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ LocalIntersection *local_isect,
+ int local_object,
+ uint *lcg_state,
+ int max_hits)
+{
+ /* Traversal stack in local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_tex_fetch(__object_node, local_object);
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = ray->t;
+
+ local_isect->num_hits = 0;
+
+ const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+ if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+ isect_t = bvh_instance_motion_push(kg,
+ local_object,
+ ray,
+ &P,
+ &dir,
+ &idir,
+ isect_t,
+ &ob_itfm);
+#else
+ isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+#endif
+ object = local_object;
+ }
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return;
+ }
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ /* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ triangle_intersect_local(kg,
+ local_isect,
+ P,
+ dir,
+ object,
+ local_object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ motion_triangle_intersect_local(kg,
+ local_isect,
+ P,
+ dir,
+ ray->time,
+ object,
+ local_object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits);
+ }
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+}
+
+#undef NODE_INTERSECT
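The traversal above leans on __bscf(), a bit-scan-forward that also clears the scanned bit, to peel hit children off the intersection mask one at a time; nearest-first ordering is then restored by the obvh_stack_sort() cascade. A standalone sketch of the idiom in plain C++, with __builtin_ctz standing in for the kernel's helper:

#include <cstdint>

/* Index of the lowest set bit, clearing it -- the role __bscf() plays above. */
static inline int bscf(uint32_t& mask)
{
	const int r = __builtin_ctz(mask);
	mask &= mask - 1;
	return r;
}

/* Usage: visit every hit child encoded in an 8-bit mask.
 * uint32_t m = child_mask;
 * while(m != 0) {
 *     int r = bscf(m);   // child slot r was hit
 *     ...                // fetch cnodes[r] and dist[r] for that slot
 * } */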
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h
new file mode 100644
index 00000000000..93f35f6dffb
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_nodes.h
@@ -0,0 +1,532 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Aligned nodes intersection AVX code is adapted from Embree.
+ */
+
+struct OBVHStackItem {
+ int addr;
+ float dist;
+};
+
+ccl_device_inline void obvh_near_far_idx_calc(const float3& idir,
+ int *ccl_restrict near_x,
+ int *ccl_restrict near_y,
+ int *ccl_restrict near_z,
+ int *ccl_restrict far_x,
+ int *ccl_restrict far_y,
+ int *ccl_restrict far_z)
+{
+#ifdef __KERNEL_SSE__
+ *near_x = 0; *far_x = 1;
+ *near_y = 2; *far_y = 3;
+ *near_z = 4; *far_z = 5;
+
+ const size_t mask = movemask(ssef(idir.m128));
+
+ const int mask_x = mask & 1;
+ const int mask_y = (mask & 2) >> 1;
+ const int mask_z = (mask & 4) >> 2;
+
+ *near_x += mask_x; *far_x -= mask_x;
+ *near_y += mask_y; *far_y -= mask_y;
+ *near_z += mask_z; *far_z -= mask_z;
+#else
+ if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
+ if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
+ if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
+#endif
+}
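A worked example of the index calculation above, illustrative only: for a ray with idir = (+0.5, -2.0, +1.0), movemask() packs the sign bits into mask = 0b010, so only the Y indices swap.

int nx, ny, nz, fx, fy, fz;
const float3 idir = make_float3(0.5f, -2.0f, 1.0f);  /* hypothetical ray */
obvh_near_far_idx_calc(idir, &nx, &ny, &nz, &fx, &fy, &fz);
/* Result: nx = 0, fx = 1, ny = 3, fy = 2, nz = 4, fz = 5.
 * Bounds rows are stored as (lo_x, hi_x, lo_y, hi_y, lo_z, hi_z), so row ny
 * is hi_y: for a ray travelling toward -Y the upper bound is the entry
 * ("near") plane, and indexing by near_*/far_* avoids per-axis branches in
 * the node intersection. */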
+
+ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a,
+ OBVHStackItem *ccl_restrict b)
+{
+ OBVHStackItem tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3)
+{
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+ if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4)
+{
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+ if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); }
+ if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); }
+ if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); }
+ if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5)
+{
+ obvh_stack_sort(s1, s2, s3, s4);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6,
+ OBVHStackItem *ccl_restrict s7)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5, s6);
+ if(s7->dist < s6->dist) {
+ obvh_item_swap(s6, s7);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6,
+ OBVHStackItem *ccl_restrict s7,
+ OBVHStackItem *ccl_restrict s8)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
+ if(s8->dist < s7->dist) {
+ obvh_item_swap(s7, s8);
+ if(s7->dist < s6->dist) {
+ obvh_item_swap(s6, s7);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#else
+ const avx3f& org,
+#endif
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+ const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x);
+ const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y);
+ const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z);
+ const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x);
+ const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y);
+ const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z);
+
+ const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const avxb vmask = tnear <= tfar;
+ int mask = (int)movemask(vmask);
+ *dist = tnear;
+ return mask;
+#else
+ return 0;
+#endif
+}
+
+ccl_device_inline int obvh_aligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#else
+ const avx3f& P,
+#endif
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+ const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
+ const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
+ const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
+ const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
+ const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
+ const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const avxb vmask = round_down*tnear <= round_up*tfar;
+ int mask = (int)movemask(vmask);
+ *dist = tnear;
+ return mask;
+#else
+ return 0;
+#endif
+}
+
+/* Unaligned nodes intersection */
+
+ccl_device_inline int obvh_unaligned_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#endif
+ const avx3f& org,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+ const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+ const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+ const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+ const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+ const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+ const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+ const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+ const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+ const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+ const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+ const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+ const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+ const avxf neg_one(-1.0f);
+ const avxf nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const avxf tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const avxf tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+ const avxf tnear_x = min(tlower_x, tupper_x);
+ const avxf tnear_y = min(tlower_y, tupper_y);
+ const avxf tnear_z = min(tlower_z, tupper_z);
+ const avxf tfar_x = max(tlower_x, tupper_x);
+ const avxf tfar_y = max(tlower_y, tupper_y);
+ const avxf tfar_z = max(tlower_z, tupper_z);
+ const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+ const avxb vmask = tnear <= tfar;
+ *dist = tnear;
+ return movemask(vmask);
+}
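+
+/* Unaligned nodes store a per-child inverse transform (the tfm_* rows above)
+ * that maps the ray into each child's unit-box space. With
+ * nrdir = -1 / aligned_dir, tlower = aligned_P * nrdir is the distance to
+ * the bounding plane at 0 and tupper = tlower - nrdir the distance to the
+ * plane at 1, i.e. a regular slab test against the [0, 1] box. */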
+
+ccl_device_inline int obvh_unaligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#endif
+ const avx3f& P,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+ const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+ const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+ const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+ const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+ const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+ const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+ const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+ const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+ const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+ const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+ const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+ const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+
+ const avxf neg_one(-1.0f);
+ const avxf nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const avxf tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const avxf tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+
+ const avxf tnear_x = min(tlower_x, tupper_x);
+ const avxf tnear_y = min(tlower_y, tupper_y);
+ const avxf tnear_z = min(tlower_z, tupper_z);
+ const avxf tfar_x = max(tlower_x, tupper_x);
+ const avxf tfar_y = max(tlower_y, tupper_y);
+ const avxf tfar_z = max(tlower_z, tupper_z);
+
+ const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+ const avxb vmask = round_down*tnear <= round_up*tfar;
+ *dist = tnear;
+ return movemask(vmask);
+}
+
+/* Intersector wrappers.
+ *
+ * They check the node type and call the appropriate intersection code.
+ */
+
+ccl_device_inline int obvh_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#endif
+ const avx3f& org,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return obvh_unaligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#endif
+ org,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+ else {
+ return obvh_aligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#else
+ org,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+}
+
+ccl_device_inline int obvh_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#endif
+ const avx3f& P,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return obvh_unaligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#endif
+ P,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+ else {
+ return obvh_aligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#else
+ P,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+}
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
new file mode 100644
index 00000000000..3e877065127
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const int skip_object,
+ const uint max_hits,
+ uint *num_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Use likely()/unlikely() hints for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ *num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
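+
+	/* With AVX2 the node test folds (bound - P) * idir into a single msub(),
+	 * so P * idir is precomputed once per ray; org4 is still needed for the
+	 * unaligned (hair) node path and for the non-AVX2 fallback. */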
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
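+
+	/* obvh_near_far_idx_calc() derives these offsets from the sign of each
+	 * idir component: per axis, the bound row the ray enters first acts as
+	 * "near" and the other as "far", so the packed node test needs no
+	 * per-child branching. */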
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ (void)inodes;
+
+ if(false
+#ifdef __VISIBILITY_FLAG__
+ || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+#endif
+#if BVH_FEATURE(BVH_MOTION)
+ || UNLIKELY(ray->time < inodes.y)
+ || UNLIKELY(ray->time > inodes.z)
+#endif
+ ) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+					/* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+					 * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
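+
+				/* The cascade above peels hit children off child_mask with
+				 * __bscf() (bit scan and clear). With one or two survivors the
+				 * nearer child is taken directly; with more, everything is
+				 * pushed and obvh_stack_sort() reorders the topmost stack
+				 * items by distance so the closest child is popped first. */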
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ if(p_type == PRIMITIVE_TRIANGLE) {
+ int prim_count = prim_addr2 - prim_addr;
+ if(prim_count < 3) {
+ while(prim_addr < prim_addr2) {
+ kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+ int hit = triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ /* Shadow ray early termination. */
+ if(hit) {
+									/* Detect if this surface has a shader with transparent shadows. */
+
+									/* TODO: Optimize so the primitive visibility flag indicates
+									 * whether the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+									/* If there are no transparent shadows, all light is blocked. */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+									/* If the maximum number of hits is reached, block all light. */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+									/* Move on to the next entry in the intersections array. */
+ isect_array++;
+ (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+
+ isect_array->t = isect_t;
+ }
+
+ prim_addr++;
+							} /* while */
+						}
+						else {
+ kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+							int *nhiptr = &num_hits_in_instance;
+#else
+							int nhi = 0;
+ int *nhiptr = &nhi;
+#endif
+
+ int result = triangle_intersect8(kg,
+ &isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ prim_count,
+ num_hits,
+ max_hits,
+ nhiptr,
+ isect_t);
+ if(result == 2) {
+ return true;
+ }
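+							/* The batched path hands the whole 3..8 triangle
+							 * range of the leaf to triangle_intersect8(),
+							 * defined elsewhere in this patch; judging by the
+							 * call sites, a return value of 2 signals that
+							 * traversal can stop early (opaque blocker, or
+							 * max_hits reached). */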
+ } // prim_count
+ } // PRIMITIVE_TRIANGLE
+ else {
+ while(prim_addr < prim_addr2) {
+ kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+
+#ifdef __SHADOW_TRICKS__
+ uint tri_object = (object == OBJECT_NONE)
+ ? kernel_tex_fetch(__prim_object, prim_addr)
+ : object;
+ if(tri_object == skip_object) {
+ ++prim_addr;
+ continue;
+ }
+#endif
+
+ bool hit;
+
+								/* TODO: Specialized intersect functions which don't fill in
+								 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+								 * Might give a few percent performance improvement. */
+
+								switch(p_type) {
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ hit = motion_triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ ray->time,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
+ }
+ else {
+ hit = curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
+ }
+ break;
+ }
+#endif
+ default: {
+ hit = false;
+ break;
+ }
+ }
+
+ /* Shadow ray early termination. */
+ if(hit) {
+									/* Detect if this surface has a shader with transparent shadows. */
+
+									/* TODO: Optimize so the primitive visibility flag indicates
+									 * whether the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+									/* If there are no transparent shadows, all light is blocked. */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+									/* If the maximum number of hits is reached, block all light. */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+									/* Move on to the next entry in the intersections array. */
+ isect_array++;
+ (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+
+ isect_array->t = isect_t;
+ }
+
+ prim_addr++;
+							} /* while prim */
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+# else
+ isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+# endif
+
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+# endif
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
new file mode 100644
index 00000000000..2021d8e1143
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_traversal.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ ,uint *lcg_state,
+ float difl,
+ float extmax
+#endif
+ )
+{
+	/* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[0].dist = -FLT_MAX;
+
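+	/* Bottom-of-stack sentinel: the address terminates the traversal loops,
+	 * while the -FLT_MAX distance keeps the node_dist culling test harmless
+	 * whenever the sentinel entry is popped. */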
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+ float node_dist = -FLT_MAX;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ BVH_DEBUG_INIT();
+ avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ (void)inodes;
+
+ if(UNLIKELY(node_dist > isect->t)
+#if BVH_FEATURE(BVH_MOTION)
+ || UNLIKELY(ray->time < inodes.y)
+ || UNLIKELY(ray->time > inodes.z)
+#endif
+#ifdef __VISIBILITY_FLAG__
+ || (__float_as_uint(inodes.x) & visibility) == 0
+#endif
+ )
+ {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ int child_mask;
+ avxf dist;
+
+ BVH_DEBUG_NEXT_NODE();
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(difl != 0.0f) {
+					/* NOTE: We extend all the child bounding boxes instead of
+					 * fetching and checking visibility flags for each of them.
+					 *
+					 * Need to test if doing the opposite would be any faster.
+ */
+ child_mask = NODE_INTERSECT_ROBUST(kg,
+ tnear,
+ tfar,
+# ifdef __KERNEL_AVX2__
+ P_idir4,
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+# endif
+# if BVH_FEATURE(BVH_HAIR)
+ dir4,
+# endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ &dist);
+ }
+ else
+#endif /* BVH_HAIR_MINIMUM_WIDTH */
+ {
+ child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+ }
+
+ if(child_mask != 0) {
+ avxf cnodes;
+ /* TODO(sergey): Investigate whether moving cnodes upwards
+ * gives a speedup (will be different cache pattern but will
+					 * avoid the extra check here).
+ */
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ float d0 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ node_dist = d0;
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ node_dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ node_dist = d0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+					/* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+#ifdef __VISIBILITY_FLAG__
+ if(UNLIKELY((node_dist > isect->t) ||
+ ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+ if(UNLIKELY((node_dist > isect->t)))
+#endif
+ {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ int prim_count = prim_addr2 - prim_addr;
+ if(prim_count < 3) {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr))
+ {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+							} /* for */
+ }
+ else {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect8(kg,
+ &isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ prim_count,
+ 0,
+ 0,
+ NULL,
+ 0.0f))
+ {
+ tfar = avxf(isect->t);
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+						} /* prim count */
+ break;
+ }
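+					/* Each accepted hit shrinks isect->t, and refreshing the
+					 * packed tfar with it lets the node intersection step cull
+					 * subtrees that can no longer contain a closer hit. */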
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(motion_triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr))
+ {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+ kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
+ bool hit;
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ else {
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ if(hit) {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_HAIR) */
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
+# else
+ qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[stack_ptr].dist = -FLT_MAX;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ BVH_DEBUG_NEXT_INSTANCE();
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
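+
+/* NOTE: like the qbvh_*.h headers, this file is a template that is
+ * instantiated with BVH_FUNCTION_FULL_NAME and BVH_FEATURE defined by the
+ * feature-specific BVH headers. A sketch of the include convention, assuming
+ * it mirrors the existing QBVH one in bvh_traversal.h:
+ *
+ *   #ifdef __KERNEL_AVX2__
+ *   #  include "obvh_traversal.h"
+ *   #endif
+ */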
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
new file mode 100644
index 00000000000..da9ddbd4f24
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_volume.h
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+	/* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+					/* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+					 * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+ if((__float_as_uint(leaf.z) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
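+
+/* NOTE: this variant records only the closest volume hit; the companion
+ * obvh_volume_all.h below collects up to max_hits intersections, mirroring
+ * the existing qbvh_volume / qbvh_volume_all pair. */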
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
new file mode 100644
index 00000000000..a88573e6f86
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_volume_all.h
@@ -0,0 +1,554 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+	/* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ uint num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return 0;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
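+	/* Splat the ray interval and direction into all eight SIMD lanes for
+	 * the 8-wide node tests below. */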
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+				/* Here starts the slow path for three or more hit
+				 * children. We push all nodes onto the stack to sort
+				 * them there.
+				 */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+ if((__float_as_uint(leaf.z) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+ bool hit;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ if(hit) {
+ /* Move on to next entry in intersections array. */
+ isect_array++;
+ num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ isect_array->t = isect_t;
+ if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+#endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ if(hit) {
+ /* Move on to next entry in intersections array. */
+ isect_array++;
+ num_hits++;
+# if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+# endif
+ isect_array->t = isect_t;
+ if(num_hits == max_hits) {
+# if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+# endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+# if BVH_FEATURE(BVH_MOTION)
+ isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+# else
+ isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+# endif
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return num_hits;
+}
+
+#undef NODE_INTERSECT
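On instance push the ray is transformed into object space and its direction renormalized, so distances recorded inside the instance are in object-space units; on pop they are rescaled to world space by t_fac = 1 / len(object-space direction). A sketch of that rescale as a hypothetical standalone helper (the loops above inline it):

	ccl_device_inline void obvh_rescale_instance_hits(Intersection *isect_array,
	                                                  int num_hits_in_instance,
	                                                  float t_fac)
	{
		/* isect_array points one entry past the last recorded hit. */
		for(int i = 0; i < num_hits_in_instance; i++) {
			(isect_array - i - 1)->t *= t_fac;
		}
	}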
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 3036efd4198..2e622af1758 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
/* Axis-aligned nodes intersection */
-ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
const ssef& isect_near,
const ssef& isect_far,
#ifdef __KERNEL_AVX2__
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index a3b23115ae4..0e2994a05d7 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2014, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
return false;
}
-/* Special ray intersection routines for local intersection. In that case we
+#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
+#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
+ccl_device_inline
+#else
+ccl_device_forceinline
+#endif
+int ray_triangle_intersect8(KernelGlobals *kg,
+ float3 ray_P,
+ float3 ray_dir,
+ Intersection **isect,
+ uint visibility,
+ int object,
+ __m256 *triA,
+ __m256 *triB,
+ __m256 *triC,
+ int prim_addr,
+ int prim_num,
+ uint *num_hits,
+ uint max_hits,
+ int *num_hits_in_instance,
+ float isec_t)
+{
+
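+	/* Bitmask of the SIMD lanes that hold valid triangles; prim_num <= 8. */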
+ const unsigned char prim_num_mask = (1 << prim_num) - 1;
+
+ const __m256i zero256 = _mm256_setzero_si256();
+
+ const __m256 Px256 = _mm256_set1_ps(ray_P.x);
+ const __m256 Py256 = _mm256_set1_ps(ray_P.y);
+ const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
+
+ const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
+ const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
+ const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
+
+	/* Calculate vertices relative to ray origin:
+	 *   const float3 v0 = tri_c - P;
+	 *   const float3 v1 = tri_a - P;
+	 *   const float3 v2 = tri_b - P;
+	 */
+
+ __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
+ __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
+ __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
+
+ __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
+ __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
+ __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
+
+ __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
+ __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
+ __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
+
+ __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
+ __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
+ __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
+
+ __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
+ __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
+ __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
+
+ __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
+ __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
+ __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
+
+	/* Calculate triangle edges:
+	 *   const float3 e0 = v2 - v0;
+	 *   const float3 e1 = v0 - v1;
+	 *   const float3 e2 = v1 - v2;
+	 */
+
+ __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
+ __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
+ __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
+
+ __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
+ __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
+ __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
+
+ __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
+ __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
+ __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
+
+	/* Perform edge tests:
+	 *   const float U = dot(cross(v2 + v0, e0), ray_dir);
+	 *   const float V = dot(cross(v0 + v1, e1), ray_dir);
+	 *   const float W = dot(cross(v1 + v2, e2), ray_dir);
+	 */
+
+	/* cross(A, B) = (A.y*B.z - A.z*B.y, A.z*B.x - A.x*B.z, A.x*B.y - A.y*B.x) */
+	__m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
+	__m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
+	__m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
+	/* Vertical dot product with the ray direction. */
+	__m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
+	U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256);
+	U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256);
+
+	__m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
+	__m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
+	__m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
+	__m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
+	V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);
+	V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);
+
+	__m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
+	__m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
+	__m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
+	__m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
+	W_256 = _mm256_fmadd_ps(W_y_256, diry256, W_256);
+	W_256 = _mm256_fmadd_ps(W_z_256, dirz256, W_256);
+
+	/* Reject lanes where the signs of U, V and W differ:
+	 *   const float minUVW = min(U, min(V, W));
+	 *   const float maxUVW = max(U, max(V, W));
+	 *   if(minUVW < 0.0f && maxUVW > 0.0f) -> miss
+	 */
+#if 0
+	__m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
+	__m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
+
+	__m256i mask_minmaxUVW_256 = _mm256_and_si256(
+	        _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
+	        _mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
+#else
+	/* Count the sign bits of U, V and W per lane: a count of 1 or 2 means
+	 * the signs are mixed, matching the min/max test above (up to signed
+	 * zeros).
+	 */
+	__m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
+	__m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
+	__m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
+	__m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
+
+	const __m256i one256 = _mm256_set1_epi32(1);
+	const __m256i two256 = _mm256_set1_epi32(2);
+
+	__m256i mask_minmaxUVW_256 = _mm256_or_si256(
+	        _mm256_cmpeq_epi32(one256, UVW_256_1),
+	        _mm256_cmpeq_epi32(two256, UVW_256_1));
+#endif
+
+	unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
+	if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) {
+		/* All active lanes rejected. */
+		return false;
+	}
+
+	/* Calculate geometry normal and denominator:
+	 *   const float3 Ng1 = cross(e1, e0);
+	 */
+
+ __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
+ __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
+ __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
+
+	/* const float3 Ng = Ng1 + Ng1; */
+ Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
+ Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
+ Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
+
+	/* const float den = dot3(Ng, dir); */
+	__m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
+	den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256, den_256);
+	den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256, den_256);
+
+	/* Perform depth test:
+	 *   const float T = dot3(v0, Ng);
+	 */
+	__m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
+	T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256, T_256);
+	T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256, T_256);
+
+	/* const int sign_den = (__float_as_int(den) & 0x80000000); */
+	const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
+	__m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
+
+	/* const float sign_T = xor_signmask(T, sign_den); */
+	__m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
+
+	/* if((sign_T < 0.0f) || mask_minmaxUVW_pos) { return false; } */
+	unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
+	if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
+		return false;
+	}
+
+	__m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
+
+	ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
+	ccl_align(32) unsigned int mask_minmaxUVW8[8];
+
+	if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+		__m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);  /* ~mask_minmaxUVW_256 */
+
+ __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+
+ __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
+ __m256 rayt_256 = _mm256_set1_ps((*isect)->t);
+
+ __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
+ _mm256_castps_si256(
+ _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
+ )
+ );
+
+ mask0 = _mm256_or_si256(mask1, mask0);
+		/* mask_final = (~mask0) & (~maskden) & (~mask_minmaxUVW). */
+		mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256);
+		mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256);
+
+ unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
+		if((mask_final & prim_num_mask) == 0) {
+			/* No active lane passed the tests. */
+			return false;
+		}
+
+		/* Find the first lane that passed all tests; for an opaque shadow
+		 * ray any hit suffices. */
+		unsigned long i = 0;
+#if defined(_MSC_VER)
+		_BitScanForward(&i, (unsigned long)mask_final);
+#else
+		i = __builtin_ffs(mask_final) - 1;
+#endif
+
+		/* Approximate reciprocal (relative error up to ~1.5 * 2^-12); good
+		 * enough here since any opaque hit terminates the ray. */
+		den_256 = _mm256_rcp_ps(den_256);  /* inv_den */
+		U_256 = _mm256_mul_ps(U_256, den_256);
+		V_256 = _mm256_mul_ps(V_256, den_256);
+		T_256 = _mm256_mul_ps(T_256, den_256);
+
+		_mm256_store_ps(U8, U_256);
+		_mm256_store_ps(V8, V_256);
+		_mm256_store_ps(T8, T_256);
+
+		/* Here we assume that (kernel_tex_fetch(__prim_visibility,
+		 * prim_addr + i) & visibility) is always true. */
+		(*isect)->u = U8[i];
+		(*isect)->v = V8[i];
+		(*isect)->t = T8[i];
+
+		(*isect)->prim = (prim_addr + i);
+		(*isect)->object = object;
+		(*isect)->type = PRIMITIVE_TRIANGLE;
+
+		return true;
+	}
+ else {
+ _mm256_store_ps(den8, den_256);
+ _mm256_store_ps(U8, U_256);
+ _mm256_store_ps(V8, V_256);
+ _mm256_store_ps(T8, T_256);
+
+ _mm256_store_ps(sign_T8, sign_T_256);
+ _mm256_store_ps(xor_signmask8, xor_signmask_256);
+ _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
+
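+		/* 0 = no hit, 1 = hit(s) recorded, 2 = all light blocked (opaque
+		 * shader or max_hits reached), see the shadow path below. */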
+ int ret = false;
+
+ if(visibility == PATH_RAY_SHADOW) {
+ for(int i = 0; i < prim_num; i++) {
+ if(!mask_minmaxUVW8[i]) {
+#ifdef __VISIBILITY_FLAG__
+ if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+#endif
+ {
+ if((sign_T8[i] >= 0.0f) &&
+ (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
+ {
+ if(den8[i]) {
+ const float inv_den = 1.0f / den8[i];
+
+ (*isect)->u = U8[i] * inv_den;
+ (*isect)->v = V8[i] * inv_den;
+ (*isect)->t = T8[i] * inv_den;
+
+ (*isect)->prim = (prim_addr + i);
+ (*isect)->object = object;
+ (*isect)->type = PRIMITIVE_TRIANGLE;
+
+ int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return 2;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return 2;
+ }
+ /* move on to next entry in intersections array */
+ ret = true;
+
+ (*isect)++;
+ (*num_hits)++;
+
+ (*num_hits_in_instance)++;
+
+ (*isect)->t = isec_t;
+
+					} /* den */
+				} /* if sign */
+				} /* vis */
+			} /* if mask */
+		} /* for */
+ }
+	else {  /* Default case. */
+ for(int i = 0; i < prim_num; i++) {
+ if(!mask_minmaxUVW8[i]) {
+#ifdef __VISIBILITY_FLAG__
+ if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+#endif
+ {
+ if((sign_T8[i] >= 0.0f) &&
+ (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
+ {
+ if(den8[i]) {
+ const float inv_den = 1.0f / den8[i];
+
+ (*isect)->u = U8[i] * inv_den;
+ (*isect)->v = V8[i] * inv_den;
+ (*isect)->t = T8[i] * inv_den;
+
+ (*isect)->prim = (prim_addr + i);
+ (*isect)->object = object;
+ (*isect)->type = PRIMITIVE_TRIANGLE;
+
+ ret = true;
+					} /* den */
+				} /* if sign */
+				} /* vis */
+			} /* if mask */
+		} /* for */
+	} /* default */
+		return ret;
+	}  /* else PATH_RAY_SHADOW_OPAQUE */
+}
+
+ccl_device_inline
+int triangle_intersect8(KernelGlobals *kg,
+ Intersection **isect,
+ float3 P,
+ float3 dir,
+ uint visibility,
+ int object,
+ int prim_addr,
+ int prim_num,
+ uint *num_hits,
+ uint max_hits,
+ int *num_hits_in_instance,
+ float isec_t)
+{
+ __m128 tri_a[8], tri_b[8], tri_c[8];
+ __m256 tritmp[12], tri[12];
+ __m256 triA[3], triB[3], triC[3];
+
+ int i, r;
+
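+	/* Vertices are stored as float4 in __prim_tri_verts, so each 128-bit
+	 * load fetches x, y, z plus one float of padding. */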
+ uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+ for(i = 0; i < prim_num; i++) {
+ tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ }
+	/* Create 9 or 12 placeholders: the lower 128-bit halves hold triangles
+	 * 0..3. _mm256_castps128_ps256 leaves the upper halves undefined; they
+	 * are filled below when prim_num > 4 and excluded via prim_num_mask and
+	 * the per-lane loops otherwise.
+	 */
+	tri[0] = _mm256_castps128_ps256(tri_a[0]);
+	tri[1] = _mm256_castps128_ps256(tri_b[0]);
+	tri[2] = _mm256_castps128_ps256(tri_c[0]);
+
+	tri[3] = _mm256_castps128_ps256(tri_a[1]);
+	tri[4] = _mm256_castps128_ps256(tri_b[1]);
+	tri[5] = _mm256_castps128_ps256(tri_c[1]);
+
+	tri[6] = _mm256_castps128_ps256(tri_a[2]);
+	tri[7] = _mm256_castps128_ps256(tri_b[2]);
+	tri[8] = _mm256_castps128_ps256(tri_c[2]);
+
+	if(prim_num > 3) {
+		tri[9] = _mm256_castps128_ps256(tri_a[3]);
+		tri[10] = _mm256_castps128_ps256(tri_b[3]);
+		tri[11] = _mm256_castps128_ps256(tri_c[3]);
+	}
+
+ for(i = 4, r = 0; i < prim_num; i ++, r += 3) {
+ tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1);
+ tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
+ tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
+ }
+
+ //------------------------------------------------
+ //0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1
+ //1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
+ //2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
+
+ //3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
+ //4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1
+ //5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
+
+ //6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
+ //7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1
+ //8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
+
+ //9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1
+ //10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1
+ //11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1
+
+	/* "Transpose": gather each coordinate of each vertex across all eight
+	 * triangles. */
+ tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
+ tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1
+
+ tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
+ tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1
+
+ tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
+ tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1
+
+ tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
+ tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1
+
+ tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
+ tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1
+
+ tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
+ tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! Zc2 Zc3 1 1 Zc6 Zc7 1 1
+
+ /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+ triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
+ triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
+ triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
+
+	triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb6 Xb7
+	triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb6 Yb7
+	triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb6 Zb7
+
+ triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
+ triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
+ triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
+
+ /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+
+ int result = ray_triangle_intersect8(kg, P,
+ dir,
+ isect,
+ visibility, object,
+ triA,
+ triB,
+ triC,
+ prim_addr,
+ prim_num,
+ num_hits,
+ max_hits,
+ num_hits_in_instance,
+ isec_t);
+ return result;
+}
+
+/* Special ray intersection routines for local intersection. In that case we
* only want to intersect with primitives in the same object, and if case of
* multiple hits we pick a single random primitive as the intersection point.
*/
@@ -82,7 +554,7 @@ ccl_device_inline void triangle_intersect_local(
float3 P,
float3 dir,
int object,
- int local_object,
+ int local_object,
int prim_addr,
float tmax,
uint *lcg_state,
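For reference, a scalar sketch of the per-lane test that ray_triangle_intersect8() vectorizes, assembled from the commented scalar lines inside it; the helper name and signature are illustrative only:

	ccl_device_inline bool triangle_intersect8_scalar_ref(const float3 tri_a, const float3 tri_b, const float3 tri_c,
	                                                      const float3 P, const float3 dir, float tmax,
	                                                      float *u, float *v, float *t)
	{
		/* Vertices relative to ray origin. */
		const float3 v0 = tri_c - P;
		const float3 v1 = tri_a - P;
		const float3 v2 = tri_b - P;
		/* Edge tests. */
		const float3 e0 = v2 - v0, e1 = v0 - v1, e2 = v1 - v2;
		const float U = dot(cross(v2 + v0, e0), dir);
		const float V = dot(cross(v0 + v1, e1), dir);
		const float W = dot(cross(v1 + v2, e2), dir);
		const float minUVW = min(U, min(V, W));
		const float maxUVW = max(U, max(V, W));
		if(minUVW < 0.0f && maxUVW > 0.0f) {
			return false;  /* Mixed signs: ray is outside an edge. */
		}
		/* Geometry normal and depth test. */
		const float3 Ng = 2.0f*cross(e1, e0);
		const float den = dot(Ng, dir);
		if(den == 0.0f) {
			return false;
		}
		const float T = dot(v0, Ng);
		const int sign_den = (__float_as_int(den) & 0x80000000);
		const float sign_T = xor_signmask(T, sign_den);
		if(sign_T < 0.0f || sign_T > tmax * xor_signmask(den, sign_den)) {
			return false;
		}
		const float inv_den = 1.0f / den;
		*u = U * inv_den;
		*v = V * inv_den;
		*t = T * inv_den;
		return true;
	}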
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index d26b668cb11..48a319a2726 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
/* Texture types to be compatible with CUDA textures. These are really just
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
-
template<typename T> struct texture {
ccl_always_inline const T& fetch(int index)
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-
-#ifdef __KERNEL_AVX__
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
 * compatibility with existing indices and data structures.
*/
@@ -90,7 +88,6 @@ template<typename T> struct texture {
ssef *ssef_node_data = &ssef_data[index];
return _mm256_loadu_ps((float *)ssef_node_data);
}
-
#endif
#ifdef __KERNEL_SSE2__
@@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a)
print_ssei(label, a.z);
}
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+#endif
+
#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 11657003259..c22f80ebee5 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1377,6 +1377,7 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_BVH2 = (1 << 0),
BVH_LAYOUT_BVH4 = (1 << 1),
+ BVH_LAYOUT_BVH8 = (1 << 2),
BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4,
BVH_LAYOUT_ALL = (unsigned int)(-1),
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 24043e2231b..8ffe9fd371a 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -76,6 +76,7 @@ set(SRC_HEADERS
util_sky_model.h
util_sky_model_data.h
util_avxf.h
+ util_avxb.h
util_sseb.h
util_ssef.h
util_ssei.h
@@ -96,7 +97,9 @@ set(SRC_HEADERS
util_types_float3_impl.h
util_types_float4.h
util_types_float4_impl.h
- util_types_int2.h
+ util_types_float8.h
+ util_types_float8_impl.h
+ util_types_int2.h
util_types_int2_impl.h
util_types_int3.h
util_types_int3_impl.h
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
new file mode 100644
index 00000000000..670a33f9ae6
--- /dev/null
+++ b/intern/cycles/util/util_avxb.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxf;
+
+/*! 8-wide AVX bool type. */
+struct avxb
+{
+ typedef avxb Mask; // mask type
+ typedef avxf Float; // float type
+
+ enum { size = 8 }; // number of SIMD elements
+ union { __m256 m256; int32_t v[8]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb ( ) {}
+ __forceinline avxb ( const avxb& other ) { m256 = other.m256; }
+ __forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; }
+
+ __forceinline avxb( const __m256 input ) : m256(input) {}
+ __forceinline operator const __m256&( void ) const { return m256; }
+ __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
+ __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
+ __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
+ __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
+__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
+__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
+__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
+__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+
+__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
+#if defined(__KERNEL_SSE41__)
+ return _mm256_blendv_ps(f, t, m);
+#else
+ return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
+__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
+
+#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
+ ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
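+/* Note: _mm256_shuffle_epi32() takes an 8-bit immediate that is applied to
+ * both 128-bit lanes, so only patterns expressible with indices 0-3 per
+ * lane behave as expected here. */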
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7> __forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0));
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
+ bool(a[5])+bool(a[6])+bool(a[7]); }
+#endif
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
+__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+	printf("%s: %d %d %d %d %d %d %d %d\n",
+	       label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_AVXB_H__ */
+
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 2451213963a..81365f96c94 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -19,7 +19,8 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX__
+struct avxb;
+
struct avxf
{
typedef avxf Float;
@@ -53,6 +54,9 @@ struct avxf
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+ __forceinline avxf(float3 a) :
+ m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
+
__forceinline avxf(int a3, int a2, int a1, int a0)
{
@@ -73,8 +77,24 @@ struct avxf
m256 = _mm256_insertf128_ps(foo, b, 1);
}
+ __forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
+ __forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
};
+__forceinline avxf cross(const avxf& a, const avxf& b)
+{
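+	/* Two float3 cross products at once: one in lanes 0-2 and one in lanes
+	 * 4-6, with lanes 3 and 7 zeroed. */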
+ avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
+ 0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
+ return r;
+}
+
+__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
+{
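+	/* Two horizontal float3 dot products at once: den from lanes 0-2 and
+	 * den2 from lanes 4-6. */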
+ const avxf t = _mm256_mul_ps(a.m256, b.m256);
+ den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+ den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
+}
+
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
+__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
+
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
@@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}
+__forceinline avxf maxi(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_max_ps(a, b);
+	return ci;
+}
+
+__forceinline avxf mini(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_min_ps(a, b);
+	return ci;
+}
+
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
return c-(a*b);
#endif
}
+__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
+ return _mm256_fmsub_ps(a, b, c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __KERNEL_AVX2__
+__forceinline const avxb operator <=(const avxf& a, const avxf& b) {
+ return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
+}
+#endif
+
#endif
#ifndef _mm256_set_m128
@@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
CCL_NAMESPACE_END
-#endif
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 9a66a372822..a761c9c46de 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -57,7 +57,19 @@ void DebugFlags::CPU::reset()
#undef STRINGIFY
#undef CHECK_CPU_FLAGS
- bvh_layout = BVH_LAYOUT_DEFAULT;
+	if(getenv("CYCLES_BVH2") != NULL) {
+		bvh_layout = BVH_LAYOUT_BVH2;
+	}
+	else if(getenv("CYCLES_BVH4") != NULL) {
+		bvh_layout = BVH_LAYOUT_BVH4;
+	}
+	else if(getenv("CYCLES_BVH8") != NULL) {
+		bvh_layout = BVH_LAYOUT_BVH8;
+	}
+	else {
+		bvh_layout = BVH_LAYOUT_DEFAULT;
+	}
+
split_kernel = false;
}
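With this change the BVH layout can be forced per run for benchmarking: set one of the CYCLES_BVH2, CYCLES_BVH4 or CYCLES_BVH8 environment variables before starting Blender; when none is set the layout falls back to BVH_LAYOUT_DEFAULT.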
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 84206a7ba5a..8ac075fe344 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -119,6 +119,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2.h"
#include "util/util_types_float3.h"
#include "util/util_types_float4.h"
+#include "util/util_types_float8.h"
#include "util/util_types_vector3.h"
@@ -138,6 +139,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2_impl.h"
#include "util/util_types_float3_impl.h"
#include "util/util_types_float4_impl.h"
+#include "util/util_types_float8_impl.h"
#include "util/util_types_vector3_impl.h"
@@ -146,8 +148,11 @@ CCL_NAMESPACE_END
# include "util/util_sseb.h"
# include "util/util_ssei.h"
# include "util/util_ssef.h"
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# include "util/util_avxb.h"
# include "util/util_avxf.h"
#endif
+#endif
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h
new file mode 100644
index 00000000000..2e693d49634
--- /dev/null
+++ b/intern/cycles/util/util_types_float8.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_H__
+#define __UTIL_TYPES_FLOAT8_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+
+struct ccl_try_align(32) float8 {
+#ifdef __KERNEL_AVX2__
+ union {
+ __m256 m256;
+ struct { float a, b, c, d, e, f, g, h; };
+ };
+
+ __forceinline float8();
+ __forceinline float8(const float8& a);
+ __forceinline explicit float8(const __m256& a);
+
+ __forceinline operator const __m256&(void) const;
+ __forceinline operator __m256&(void);
+
+ __forceinline float8& operator =(const float8& a);
+
+#else /* __KERNEL_AVX2__ */
+ float a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX2__ */
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float8 make_float8(float f);
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_H__ */
diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h
new file mode 100644
index 00000000000..4fac03569e9
--- /dev/null
+++ b/intern/cycles/util/util_types_float8_impl.h
@@ -0,0 +1,113 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
+#define __UTIL_TYPES_FLOAT8_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_AVX2__
+__forceinline float8::float8()
+{
+}
+
+__forceinline float8::float8(const float8& f)
+ : m256(f.m256)
+{
+}
+
+__forceinline float8::float8(const __m256& f)
+ : m256(f)
+{
+}
+
+__forceinline float8::operator const __m256&(void) const
+{
+ return m256;
+}
+
+__forceinline float8::operator __m256&(void)
+{
+ return m256;
+}
+
+__forceinline float8& float8::operator =(const float8& f)
+{
+ m256 = f.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX2__ */
+
+__forceinline float float8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline float& float8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+ccl_device_inline float8 make_float8(float f)
+{
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_set1_ps(f));
+#else
+ float8 r = {f, f, f, f, f, f, f, f};
+#endif
+ return r;
+}
+
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX2__
+	/* Note: _mm256_set_ps() takes arguments from the highest lane to the
+	 * lowest, so reverse the order to match the scalar path below. */
+	float8 r(_mm256_set_ps(h, g, f, e, d, c, b, a));
+#else
+ float8 r = {a, b, c, d, e, f, g, h};
+#endif
+ return r;
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */