author    Sergey Sharybin <sergey.vfx@gmail.com>  2018-02-14 13:23:30 +0300
committer Sergey Sharybin <sergey.vfx@gmail.com>  2018-08-29 16:03:09 +0300
commit    73f20560529457ea177cb93e8e8eaaf44a589643
tree      45ea2ebad9adabcedd7833629421909ede9f6fb5 /intern/cycles
parent    66f8a4c07e8a5fc166579101933264b8425a7cd1
Cycles: Add BVH8 and packeted triangle intersection
This is an initial implementation of the BVH8 optimization structure
and packeted triangle intersection. The aim is to get faster
ray-to-scene intersection checks.

    Scene                  BVH4        BVH8
    barbershop_interior    10:24.94    10:10.74
    bmw27                  02:41.25    02:38.83
    classroom              08:16.49    07:56.15
    fishy_cat              04:24.56    04:17.29
    koro                   06:03.06    06:01.45
    pavillon_barcelona     09:21.26    09:02.98
    victor                 23:39.65    22:53.71

Memory-wise, peak usage rises by about 4.7% in complex scenes.

Note that BVH8 is disabled when using OSL, because the OSL kernel does
not get per-microarchitecture optimizations and hence always assumes
BVH4 is used.

Original BVH8 patch from Anton Gavrikov.
Batched triangles intersection from Victoria Zhislina.
Extra work, tests and fixes from Maxym Dmytrychenko.
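As background for the change, the core speedup comes from testing a ray against all eight children of a BVH8 node at once with AVX. Below is a minimal, self-contained sketch of such an 8-wide AABB slab test. It is illustration only, not code from this patch: the actual kernel works on the packed node layout and precomputes near/far plane indices per axis (see obvh_nodes.h further down), and the struct and function names here are made up for the example.

#include <immintrin.h>

/* Bounds of the eight children of one node, stored struct-of-arrays so each
 * plane can be loaded into a single 8-lane AVX register. */
struct BVH8NodeBounds {
  float lower_x[8], upper_x[8];
  float lower_y[8], upper_y[8];
  float lower_z[8], upper_z[8];
};

/* Slab test of one ray against all eight child boxes at once. Returns a
 * bitmask with bit i set if child i is hit within [t_min, t_max]. */
static inline int intersect_node8(const BVH8NodeBounds &node,
                                  const float org[3],
                                  const float idir[3],  /* 1 / ray direction */
                                  float t_min, float t_max)
{
  const __m256 ox = _mm256_set1_ps(org[0]), ix = _mm256_set1_ps(idir[0]);
  const __m256 oy = _mm256_set1_ps(org[1]), iy = _mm256_set1_ps(idir[1]);
  const __m256 oz = _mm256_set1_ps(org[2]), iz = _mm256_set1_ps(idir[2]);

  /* Distances to the lower and upper slab planes on each axis. */
  const __m256 tlx = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.lower_x), ox), ix);
  const __m256 tux = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.upper_x), ox), ix);
  const __m256 tly = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.lower_y), oy), iy);
  const __m256 tuy = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.upper_y), oy), iy);
  const __m256 tlz = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.lower_z), oz), iz);
  const __m256 tuz = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(node.upper_z), oz), iz);

  /* Entry point: farthest of the per-axis near planes (and t_min). */
  __m256 tnear = _mm256_set1_ps(t_min);
  tnear = _mm256_max_ps(tnear, _mm256_min_ps(tlx, tux));
  tnear = _mm256_max_ps(tnear, _mm256_min_ps(tly, tuy));
  tnear = _mm256_max_ps(tnear, _mm256_min_ps(tlz, tuz));

  /* Exit point: nearest of the per-axis far planes (and t_max). */
  __m256 tfar = _mm256_set1_ps(t_max);
  tfar = _mm256_min_ps(tfar, _mm256_max_ps(tlx, tux));
  tfar = _mm256_min_ps(tfar, _mm256_max_ps(tly, tuy));
  tfar = _mm256_min_ps(tfar, _mm256_max_ps(tlz, tuz));

  /* Child i is hit if its entry point is not behind its exit point. */
  return _mm256_movemask_ps(_mm256_cmp_ps(tnear, tfar, _CMP_LE_OQ));
}

The real obvh_aligned_node_intersect() in this patch follows the same pattern, but fetches the per-child planes directly from the flat __bvh_nodes array.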
Diffstat (limited to 'intern/cycles')
-rw-r--r--intern/cycles/blender/addon/properties.py3
-rw-r--r--intern/cycles/blender/blender_sync.cpp10
-rw-r--r--intern/cycles/bvh/CMakeLists.txt2
-rw-r--r--intern/cycles/bvh/bvh.cpp91
-rw-r--r--intern/cycles/bvh/bvh.h8
-rw-r--r--intern/cycles/bvh/bvh2.cpp11
-rw-r--r--intern/cycles/bvh/bvh4.cpp29
-rw-r--r--intern/cycles/bvh/bvh8.cpp515
-rw-r--r--intern/cycles/bvh/bvh8.h97
-rw-r--r--intern/cycles/bvh/bvh_node.cpp49
-rw-r--r--intern/cycles/bvh/bvh_node.h2
-rw-r--r--intern/cycles/device/device_cpu.cpp3
-rw-r--r--intern/cycles/kernel/CMakeLists.txt8
-rw-r--r--intern/cycles/kernel/bvh/bvh.h3
-rw-r--r--intern/cycles/kernel/bvh/bvh_local.h12
-rw-r--r--intern/cycles/kernel/bvh/bvh_shadow_all.h12
-rw-r--r--intern/cycles/kernel/bvh/bvh_traversal.h16
-rw-r--r--intern/cycles/kernel/bvh/bvh_types.h2
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume.h10
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume_all.h11
-rw-r--r--intern/cycles/kernel/bvh/obvh_local.h409
-rw-r--r--intern/cycles/kernel/bvh/obvh_nodes.h532
-rw-r--r--intern/cycles/kernel/bvh/obvh_shadow_all.h687
-rw-r--r--intern/cycles/kernel/bvh/obvh_traversal.h642
-rw-r--r--intern/cycles/kernel/bvh/obvh_volume.h483
-rw-r--r--intern/cycles/kernel/bvh/obvh_volume_all.h554
-rw-r--r--intern/cycles/kernel/bvh/qbvh_nodes.h3
-rw-r--r--intern/cycles/kernel/geom/geom_triangle_intersect.h478
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h9
-rw-r--r--intern/cycles/kernel/kernel_types.h3
-rw-r--r--intern/cycles/util/CMakeLists.txt5
-rw-r--r--intern/cycles/util/util_avxb.h192
-rw-r--r--intern/cycles/util/util_avxf.h51
-rw-r--r--intern/cycles/util/util_debug.cpp14
-rw-r--r--intern/cycles/util/util_types.h5
-rw-r--r--intern/cycles/util/util_types_float8.h70
-rw-r--r--intern/cycles/util/util_types_float8_impl.h113
37 files changed, 5079 insertions, 65 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index c97d942af9d..80b83c94012 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -50,6 +50,7 @@ enum_displacement_methods = (
enum_bvh_layouts = (
('BVH2', "BVH2", "", 1),
('BVH4', "BVH4", "", 2),
+ ('BVH8', "BVH8", "", 4),
)
enum_bvh_types = (
@@ -686,7 +687,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_bvh_layout = EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,
- default='BVH4',
+ default='BVH8',
)
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index fedcb874269..fcdadf0ad6e 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -679,7 +679,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
params.texture_limit = 0;
}
- params.bvh_layout = DebugFlags().cpu.bvh_layout;
+ /* TODO(sergey): Once OSL supports per-microarchitecture optimization get
+ * rid of this.
+ */
+ if (params.shadingsystem == SHADINGSYSTEM_OSL) {
+ params.bvh_layout = BVH_LAYOUT_BVH4;
+ }
+ else {
+ params.bvh_layout = DebugFlags().cpu.bvh_layout;
+ }
return params;
}
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index b8171e7f70d..fcd28572fdf 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -10,6 +10,7 @@ set(SRC
bvh.cpp
bvh2.cpp
bvh4.cpp
+ bvh8.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_node.cpp
@@ -22,6 +23,7 @@ set(SRC_HEADERS
bvh.h
bvh2.h
bvh4.h
+ bvh8.h
bvh_binning.h
bvh_build.h
bvh_node.h
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index b524ca07d8d..bc73a3ad264 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -22,6 +22,7 @@
#include "bvh/bvh2.h"
#include "bvh/bvh4.h"
+#include "bvh/bvh8.h"
#include "bvh/bvh_build.h"
#include "bvh/bvh_node.h"
@@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout)
switch(layout) {
case BVH_LAYOUT_BVH2: return "BVH2";
case BVH_LAYOUT_BVH4: return "BVH4";
+ case BVH_LAYOUT_BVH8: return "BVH8";
case BVH_LAYOUT_NONE: return "NONE";
case BVH_LAYOUT_ALL: return "ALL";
}
@@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
return new BVH2(params, objects);
case BVH_LAYOUT_BVH4:
return new BVH4(params, objects);
+ case BVH_LAYOUT_BVH8:
+ return new BVH8(params, objects);
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
@@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
}
}
visibility |= ob->visibility_for_tracing();
+
+ }
+}
+
+bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh)
+{
+ if(node->is_leaf()) {
+ return node->is_unaligned;
+ }
+ else {
+ return node_is_unaligned(node, bvh);
+ }
+}
+
+bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh)
+{
+ const BVHNode *node0 = node->get_child(0);
+ const BVHNode *node1 = node->get_child(1);
+
+ switch(bvh) {
+ case bvh2:
+ return node0->is_unaligned || node1->is_unaligned;
+ break;
+ case bvh4:
+ return leaf_check(node0, bvh2) || leaf_check(node1, bvh2);
+ break;
+ case bvh8:
+ return leaf_check(node0, bvh4) || leaf_check(node1, bvh4);
+ break;
+ default:
+ assert(0);
+ return false;
}
}
@@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
* BVH's are stored in global arrays. This function merges them into the
* top level BVH, adjusting indexes and offsets where appropriate.
*/
- /* TODO(sergey): This code needs adjustment for wider BVH than 4. */
const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
+ const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
/* Adjust primitive index to point to the triangle in the global array, for
* meshes with transform applied and already in the top level BVH.
@@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
size_t nsize, nsize_bbox;
if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
- nsize = use_qbvh
- ? BVH_UNALIGNED_QNODE_SIZE
- : BVH_UNALIGNED_NODE_SIZE;
- nsize_bbox = (use_qbvh)? 13: 0;
+ if(use_obvh) {
+ nsize = BVH_UNALIGNED_ONODE_SIZE;
+ nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1;
+ }
+ else {
+ nsize = use_qbvh
+ ? BVH_UNALIGNED_QNODE_SIZE
+ : BVH_UNALIGNED_NODE_SIZE;
+ nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0;
+ }
}
else {
- nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
- nsize_bbox = (use_qbvh)? 7: 0;
+ if(use_obvh) {
+ nsize = BVH_ONODE_SIZE;
+ nsize_bbox = BVH_ONODE_SIZE-1;
+ }
+ else {
+ nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
+ nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0;
+ }
}
memcpy(pack_nodes + pack_nodes_offset,
@@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
/* Modify offsets into arrays */
int4 data = bvh_nodes[i + nsize_bbox];
-
- data.z += (data.z < 0)? -noffset_leaf: noffset;
- data.w += (data.w < 0)? -noffset_leaf: noffset;
-
- if(use_qbvh) {
- data.x += (data.x < 0)? -noffset_leaf: noffset;
- data.y += (data.y < 0)? -noffset_leaf: noffset;
+ int4 data1 = bvh_nodes[i + nsize_bbox-1];
+ if(use_obvh) {
+ data.z += (data.z < 0) ? -noffset_leaf : noffset;
+ data.w += (data.w < 0) ? -noffset_leaf : noffset;
+ data.x += (data.x < 0) ? -noffset_leaf : noffset;
+ data.y += (data.y < 0) ? -noffset_leaf : noffset;
+ data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
+ data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
+ data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
+ data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
+ }
+ else {
+ data.z += (data.z < 0) ? -noffset_leaf : noffset;
+ data.w += (data.w < 0) ? -noffset_leaf : noffset;
+ if(use_qbvh) {
+ data.x += (data.x < 0)? -noffset_leaf: noffset;
+ data.y += (data.y < 0)? -noffset_leaf: noffset;
+ }
}
-
pack_nodes[pack_nodes_offset + nsize_bbox] = data;
+ if(use_obvh) {
+ pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
+ }
/* Usually this copies nothing, but we better
* be prepared for possible node size extension.
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 6a82f915692..86be0bae4be 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -73,6 +73,12 @@ struct PackedBVH {
}
};
+enum BVH_TYPE {
+ bvh2,
+ bvh4,
+ bvh8
+};
+
/* BVH */
class BVH
@@ -93,6 +99,8 @@ protected:
/* Refit range of primitives. */
void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
+ static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh);
+ static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh);
/* triangles and strands */
void pack_primitives();
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
index 9d89d2b6afb..4a423c16559 100644
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -25,13 +25,6 @@
CCL_NAMESPACE_BEGIN
-static bool node_bvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- return node0->is_unaligned || node1->is_unaligned;
-}
-
BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
{
@@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_bvh_is_unaligned(root)
+ nextNodeIdx += node_is_unaligned(root, bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
@@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root)
}
else {
idx[i] = nextNodeIdx;
- nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
+ nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2)
? BVH_UNALIGNED_NODE_SIZE
: BVH_NODE_SIZE;
}
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
index 4faf47af7bb..a449587e607 100644
--- a/intern/cycles/bvh/bvh4.cpp
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN
* Perhaps we can merge nodes in actual tree and make our
* life easier all over the place.
*/
-static bool node_qbvh_is_unaligned(const BVHNode *node)
-{
- const BVHNode *node0 = node->get_child(0),
- *node1 = node->get_child(1);
- bool has_unaligned = false;
- if(node0->is_leaf()) {
- has_unaligned |= node0->is_unaligned;
- }
- else {
- has_unaligned |= node0->get_child(0)->is_unaligned;
- has_unaligned |= node0->get_child(1)->is_unaligned;
- }
- if(node1->is_leaf()) {
- has_unaligned |= node1->is_unaligned;
- }
- else {
- has_unaligned |= node1->get_child(0)->is_unaligned;
- has_unaligned |= node1->get_child(1)->is_unaligned;
- }
- return has_unaligned;
-}
BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
: BVH(params_, objects_)
@@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += node_qbvh_is_unaligned(root)
+ nextNodeIdx += node_is_unaligned(root, bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root)
}
else {
idx = nextNodeIdx;
- nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
+ nextNodeIdx += node_is_unaligned(nodes[i], bvh4)
? BVH_UNALIGNED_QNODE_SIZE
: BVH_QNODE_SIZE;
}
@@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
- 4);
+ num_nodes);
}
else {
pack_aligned_node(idx,
@@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
visibility,
0.0f,
1.0f,
- 4);
+ num_nodes);
}
}
}
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
new file mode 100644
index 00000000000..e8b6726602c
--- /dev/null
+++ b/intern/cycles/bvh/bvh8.cpp
@@ -0,0 +1,515 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "bvh/bvh8.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+}
+
+void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
+{
+ float4 data[BVH_ONODE_LEAF_SIZE];
+ memset(data, 0, sizeof(data));
+ if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+ /* object */
+ data[0].x = __int_as_float(~(leaf->lo));
+ data[0].y = __int_as_float(0);
+ }
+ else {
+ /* triangle */
+ data[0].x = __int_as_float(leaf->lo);
+ data[0].y = __int_as_float(leaf->hi);
+ }
+ data[0].z = __uint_as_float(leaf->visibility);
+ if(leaf->num_triangles() != 0) {
+ data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+ }
+
+ memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+}
+
+void BVH8::pack_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ bool has_unaligned = false;
+ /* Check whether we have to create unaligned node or all nodes are aligned
+ * and we can cut some corner here.
+ */
+ if(params.use_unaligned_nodes) {
+ for(int i = 0; i < num; i++) {
+ if(en[i].node->is_unaligned) {
+ has_unaligned = true;
+ break;
+ }
+ }
+ }
+ if(has_unaligned) {
+ /* Create unaligned node with orientation transform for each of the
+ * children.
+ */
+ pack_unaligned_inner(e, en, num);
+ }
+ else {
+ /* There are no unaligned children, pack into AABB node. */
+ pack_aligned_inner(e, en, num);
+ }
+}
+
+void BVH8::pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ BoundBox bounds[8];
+ int child[8];
+ for(int i = 0; i < num; ++i) {
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_aligned_node(e.idx,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH8::pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float8 data[8];
+ memset(data, 0, sizeof(data));
+
+ data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+ data[0].b = time_from;
+ data[0].c = time_to;
+ for(int i = 0; i < num; i++) {
+ float3 bb_min = bounds[i].min;
+ float3 bb_max = bounds[i].max;
+
+ data[1][i] = bb_min.x;
+ data[2][i] = bb_max.x;
+ data[3][i] = bb_min.y;
+ data[4][i] = bb_max.y;
+ data[5][i] = bb_min.z;
+ data[6][i] = bb_max.z;
+
+ data[7][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 8; i++) {
+ /* We store BB which would never be recorded as intersection
+ * so kernel might safely assume there are always 8 child nodes.
+ */
+ data[1][i] = FLT_MAX;
+ data[2][i] = -FLT_MAX;
+
+ data[3][i] = FLT_MAX;
+ data[4][i] = -FLT_MAX;
+
+ data[5][i] = FLT_MAX;
+ data[6][i] = -FLT_MAX;
+
+ data[7][i] = __int_as_float(0);
+ }
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
+}
+
+void BVH8::pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num)
+{
+ Transform aligned_space[8];
+ BoundBox bounds[8];
+ int child[8];
+ for(int i = 0; i < num; ++i) {
+ aligned_space[i] = en[i].node->get_aligned_space();
+ bounds[i] = en[i].node->bounds;
+ child[i] = en[i].encodeIdx();
+ }
+ pack_unaligned_node(e.idx,
+ aligned_space,
+ bounds,
+ child,
+ e.node->visibility,
+ e.node->time_from,
+ e.node->time_to,
+ num);
+}
+
+void BVH8::pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num)
+{
+ float8 data[BVH_UNALIGNED_ONODE_SIZE];
+ memset(data, 0, sizeof(data));
+ data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+ data[0].b = time_from;
+ data[0].c = time_to;
+
+ for(int i = 0; i < num; i++) {
+ Transform space = BVHUnaligned::compute_node_transform(
+ bounds[i],
+ aligned_space[i]);
+
+ data[1][i] = space.x.x;
+ data[2][i] = space.x.y;
+ data[3][i] = space.x.z;
+
+ data[4][i] = space.y.x;
+ data[5][i] = space.y.y;
+ data[6][i] = space.y.z;
+
+ data[7][i] = space.z.x;
+ data[8][i] = space.z.y;
+ data[9][i] = space.z.z;
+
+ data[10][i] = space.x.w;
+ data[11][i] = space.y.w;
+ data[12][i] = space.z.w;
+
+ data[13][i] = __int_as_float(child[i]);
+ }
+
+ for(int i = num; i < 8; i++) {
+ /* We store BB which would never be recorded as intersection
+ * so kernel might safely assume there are always 8 child nodes.
+ */
+
+ data[1][i] = 1.0f;
+ data[2][i] = 0.0f;
+ data[3][i] = 0.0f;
+
+ data[4][i] = 0.0f;
+ data[5][i] = 0.0f;
+ data[6][i] = 0.0f;
+
+ data[7][i] = 0.0f;
+ data[8][i] = 0.0f;
+ data[9][i] = 0.0f;
+
+ data[10][i] = -FLT_MAX;
+ data[11][i] = -FLT_MAX;
+ data[12][i] = -FLT_MAX;
+
+ data[13][i] = __int_as_float(0);
+ }
+
+ memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE);
+}
+
+/* Octo SIMD Nodes */
+
+void BVH8::pack_nodes(const BVHNode *root)
+{
+ /* Calculate size of the arrays required. */
+ const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT);
+ const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+ assert(num_leaf_nodes <= num_nodes);
+ const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+ size_t node_size;
+ if(params.use_unaligned_nodes) {
+ const size_t num_unaligned_nodes =
+ root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT);
+ node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
+ (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
+ }
+ else {
+ node_size = num_inner_nodes * BVH_ONODE_SIZE;
+ }
+ /* Resize arrays. */
+ pack.nodes.clear();
+ pack.leaf_nodes.clear();
+ /* For top level BVH, first merge existing BVH's so we know the offsets. */
+ if(params.top_level) {
+ pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+ }
+ else {
+ pack.nodes.resize(node_size);
+ pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+ }
+
+ int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+ vector<BVHStackEntry> stack;
+ stack.reserve(BVHParams::MAX_DEPTH*2);
+ if(root->is_leaf()) {
+ stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+ }
+ else {
+ stack.push_back(BVHStackEntry(root, nextNodeIdx));
+ nextNodeIdx += node_is_unaligned(root, bvh8)
+ ? BVH_UNALIGNED_ONODE_SIZE
+ : BVH_ONODE_SIZE;
+ }
+
+ while(stack.size()) {
+ BVHStackEntry e = stack.back();
+ stack.pop_back();
+
+ if(e.node->is_leaf()) {
+ /* leaf node */
+ const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+ pack_leaf(e, leaf);
+ }
+ else {
+ /* Inner node. */
+ const BVHNode *node = e.node;
+ const BVHNode *node0 = node->get_child(0);
+ const BVHNode *node1 = node->get_child(1);
+ /* Collect nodes. */
+ const BVHNode *nodes[8];
+ int numnodes = 0;
+ if(node0->is_leaf()) {
+ nodes[numnodes++] = node0;
+ }
+ else {
+ const BVHNode *node00 = node0->get_child(0),
+ *node01 = node0->get_child(1);
+ if(node00->is_leaf()) {
+ nodes[numnodes++] = node00;
+ }
+ else {
+ nodes[numnodes++] = node00->get_child(0);
+ nodes[numnodes++] = node00->get_child(1);
+ }
+ if(node01->is_leaf()) {
+ nodes[numnodes++] = node01;
+ }
+ else {
+ nodes[numnodes++] = node01->get_child(0);
+ nodes[numnodes++] = node01->get_child(1);
+ }
+ }
+ if(node1->is_leaf()) {
+ nodes[numnodes++] = node1;
+ }
+ else {
+ const BVHNode *node10 = node1->get_child(0),
+ *node11 = node1->get_child(1);
+ if(node10->is_leaf()) {
+ nodes[numnodes++] = node10;
+ }
+ else {
+ nodes[numnodes++] = node10->get_child(0);
+ nodes[numnodes++] = node10->get_child(1);
+ }
+ if(node11->is_leaf()) {
+ nodes[numnodes++] = node11;
+ }
+ else {
+ nodes[numnodes++] = node11->get_child(0);
+ nodes[numnodes++] = node11->get_child(1);
+ }
+ }
+ /* Push entries on the stack. */
+ for(int i = 0; i < numnodes; ++i) {
+ int idx;
+ if(nodes[i]->is_leaf()) {
+ idx = nextLeafNodeIdx++;
+ }
+ else {
+ idx = nextNodeIdx;
+ nextNodeIdx += node_is_unaligned(nodes[i], bvh8)
+ ? BVH_UNALIGNED_ONODE_SIZE
+ : BVH_ONODE_SIZE;
+ }
+ stack.push_back(BVHStackEntry(nodes[i], idx));
+ }
+ /* Set node. */
+ pack_inner(e, &stack[stack.size() - numnodes], numnodes);
+ }
+ }
+ assert(node_size == nextNodeIdx);
+ /* Root index to start traversal at, to handle case of single leaf node. */
+ pack.root_index = (root->is_leaf()) ? -1 : 0;
+}
+
+void BVH8::refit_nodes()
+{
+ assert(!params.top_level);
+
+ BoundBox bbox = BoundBox::empty;
+ uint visibility = 0;
+ refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+ if(leaf) {
+ int4 *data = &pack.leaf_nodes[idx];
+ int4 c = data[0];
+ /* Refit leaf node. */
+ for(int prim = c.x; prim < c.y; prim++) {
+ int pidx = pack.prim_index[prim];
+ int tob = pack.prim_object[prim];
+ Object *ob = objects[tob];
+
+ if(pidx == -1) {
+ /* Object instance. */
+ bbox.grow(ob->bounds);
+ }
+ else {
+ /* Primitives. */
+ const Mesh *mesh = ob->mesh;
+
+ if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+ /* Curves. */
+ int str_offset = (params.top_level) ? mesh->curve_offset : 0;
+ Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+ int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+ curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+ visibility |= PATH_RAY_CURVE;
+
+ /* Motion curves. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->curve_keys.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *key_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++) {
+ curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+ }
+ }
+ }
+ }
+ else {
+ /* Triangles. */
+ int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
+ Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+ const float3 *vpos = &mesh->verts[0];
+
+ triangle.bounds_grow(vpos, bbox);
+
+ /* Motion triangles. */
+ if(mesh->use_motion_blur) {
+ Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if(attr) {
+ size_t mesh_size = mesh->verts.size();
+ size_t steps = mesh->motion_steps - 1;
+ float3 *vert_steps = attr->data_float3();
+
+ for(size_t i = 0; i < steps; i++) {
+ triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+ }
+ }
+ }
+ }
+ }
+
+ visibility |= ob->visibility;
+ }
+
+ float4 leaf_data[BVH_ONODE_LEAF_SIZE];
+ leaf_data[0].x = __int_as_float(c.x);
+ leaf_data[0].y = __int_as_float(c.y);
+ leaf_data[0].z = __uint_as_float(visibility);
+ leaf_data[0].w = __uint_as_float(c.w);
+ memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+ }
+ else {
+ int4 *data = &pack.nodes[idx];
+ bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+ int4 c;
+ if(is_unaligned) {
+ c = data[BVH_UNALIGNED_ONODE_SIZE-1];
+ }
+ else {
+ c = data[BVH_ONODE_SIZE-1];
+ }
+ /* Refit inner node, set bbox from children. */
+ BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty,
+ BoundBox::empty, BoundBox::empty };
+ uint child_visibility[8] = { 0 };
+ int num_nodes = 0;
+
+ for(int i = 0; i < 8; ++i) {
+ if(c[i] != 0) {
+ refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+ child_bbox[i], child_visibility[i]);
+ ++num_nodes;
+ bbox.grow(child_bbox[i]);
+ visibility |= child_visibility[i];
+ }
+ }
+
+ if(is_unaligned) {
+ Transform aligned_space[8] = { transform_identity(), transform_identity(),
+ transform_identity(), transform_identity(),
+ transform_identity(), transform_identity(),
+ transform_identity(), transform_identity()};
+ pack_unaligned_node(idx,
+ aligned_space,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ num_nodes);
+ }
+ else {
+ pack_aligned_node(idx,
+ child_bbox,
+ &c[0],
+ visibility,
+ 0.0f,
+ 1.0f,
+ num_nodes);
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
new file mode 100644
index 00000000000..99ff79e60b5
--- /dev/null
+++ b/intern/cycles/bvh/bvh8.h
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __BVH8_H__
+#define __BVH8_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_ONODE_SIZE 16
+#define BVH_ONODE_LEAF_SIZE 1
+#define BVH_UNALIGNED_ONODE_SIZE 28
+
+/* BVH8
+ *
+ * Octo BVH, with each node having eight children, to use with SIMD instructions.
+ */
+class BVH8 : public BVH {
+protected:
+ /* constructor */
+ friend class BVH;
+ BVH8(const BVHParams& params, const vector<Object*>& objects);
+
+ /* pack */
+ void pack_nodes(const BVHNode *root);
+
+ void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
+ void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
+
+ void pack_aligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_aligned_node(int idx,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ void pack_unaligned_inner(const BVHStackEntry& e,
+ const BVHStackEntry *en,
+ int num);
+ void pack_unaligned_node(int idx,
+ const Transform *aligned_space,
+ const BoundBox *bounds,
+ const int *child,
+ const uint visibility,
+ const float time_from,
+ const float time_to,
+ const int num);
+
+ /* refit */
+ void refit_nodes();
+ void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH8_H__ */
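For orientation, here is a rough struct-level view of the aligned node layout that pack_aligned_node() in bvh8.cpp writes and that BVH_ONODE_SIZE = 16 float4 slots above accounts for. This is an illustration only, not code from this patch; the kernel addresses these bytes as flat float4/avxf rows of the node array rather than through any such struct, and the names below are hypothetical.

/* Hypothetical view of an aligned BVH8 inner node: 8 rows of 8 floats
 * (= 16 float4 slots). Row 0 carries visibility flags and the motion time
 * range (stored as float bits), rows 1-6 one bound plane per child, row 7
 * the child indices (stored as int bits; negative values encode leaves). */
struct ONodeAlignedLayout {
  float visibility, time_from, time_to, pad[5];  /* row 0: header */
  float lower_x[8], upper_x[8];                  /* rows 1-2 */
  float lower_y[8], upper_y[8];                  /* rows 3-4 */
  float lower_z[8], upper_z[8];                  /* rows 5-6 */
  int   child[8];                                /* row 7 */
};
static_assert(sizeof(ONodeAlignedLayout) == 16 * 4 * sizeof(float),
              "matches BVH_ONODE_SIZE float4 slots");

Unaligned nodes replace the six bound rows with a 3x4 transform per child, which is where BVH_UNALIGNED_ONODE_SIZE = 28 float4 slots comes from.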
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 879d07b9625..fd8b4977699 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
}
}
return cnt;
+ case BVH_STAT_ONODE_COUNT:
+ cnt = 1;
+ for(int i = 0; i < num_children(); i++) {
+ BVHNode *node = get_child(i);
+ if(node->is_leaf()) {
+ cnt += 1;
+ }
+ else {
+ for(int j = 0; j < node->num_children(); j++)
+ {
+ BVHNode *node_next = node->get_child(j);
+ if(node_next->is_leaf()) {
+ cnt += 1;
+ }
+ else {
+ for(int k = 0; k < node_next->num_children(); k++) {
+ cnt += node_next->get_child(k)->getSubtreeSize(stat);
+ }
+ }
+ }
+ }
+ }
+ return cnt;
+ case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT:
+ {
+ bool has_unaligned = false;
+ for(int i = 0; i < num_children(); i++) {
+ BVHNode *node = get_child(i);
+ if(node->is_leaf()) {
+ has_unaligned |= node->is_unaligned;
+ }
+ else {
+ for(int j = 0; j < node->num_children(); j++) {
+ BVHNode *node_next = node->get_child(j);
+ if(node_next->is_leaf()) {
+ has_unaligned |= node_next->is_unaligned;
+ }
+ else {
+ for(int k = 0; k < node_next->num_children(); k++) {
+ cnt += node_next->get_child(k)->getSubtreeSize(stat);
+ has_unaligned |= node_next->get_child(k)->is_unaligned;
+ }
+ }
+ }
+ }
+ }
+ cnt += has_unaligned? 1: 0;
+ }
+ return cnt;
case BVH_STAT_ALIGNED_COUNT:
if(!is_unaligned) {
cnt = 1;
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 94cf5ab730c..ed89d52a50a 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -39,6 +39,8 @@ enum BVH_STAT {
BVH_STAT_ALIGNED_LEAF_COUNT,
BVH_STAT_UNALIGNED_LEAF_COUNT,
BVH_STAT_DEPTH,
+ BVH_STAT_ONODE_COUNT,
+ BVH_STAT_UNALIGNED_INNER_ONODE_COUNT,
};
class BVHParams;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index e92bbbfa6e6..5b3761f8353 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices)
if(system_cpu_support_sse2()) {
info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
}
+ if (system_cpu_support_avx2()) {
+ info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
+ }
info.has_volume_decoupled = true;
info.has_osl = true;
info.has_half_images = true;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 118e18374a1..c6e92c6d89d 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS
bvh/qbvh_traversal.h
bvh/qbvh_volume.h
bvh/qbvh_volume_all.h
+ bvh/obvh_nodes.h
+ bvh/obvh_shadow_all.h
+ bvh/obvh_local.h
+ bvh/obvh_traversal.h
+ bvh/obvh_volume.h
+ bvh/obvh_volume_all.h
)
set(SRC_HEADERS
@@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS
../util/util_types_float3_impl.h
../util/util_types_float4.h
../util/util_types_float4_impl.h
+ ../util/util_types_float8.h
+ ../util/util_types_float8_impl.h
../util/util_types_int2.h
../util/util_types_int2_impl.h
../util/util_types_int3.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index de1503e5564..2ad55d041bf 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN
/* Common QBVH functions. */
#ifdef __QBVH__
# include "kernel/bvh/qbvh_nodes.h"
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_nodes.h"
+#endif
#endif
/* Regular BVH traversal */
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 6356c197dd1..8def71bc890 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_local.h"
+# ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_local.h"
+# endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
int max_hits)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ local_isect,
+ local_object,
+ lcg_state,
+ max_hits);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index cfc567ff9ca..d525b29fd94 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_shadow_all.h"
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_shadow_all.h"
+#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
uint *num_hits)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect_array,
+ visibility,
+ max_hits,
+ num_hits);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 551625eae78..e95d2408201 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -20,6 +20,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_traversal.h"
#endif
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_traversal.h"
+#endif
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
@@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect,
+ visibility
+# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ , lcg_state,
+ difl,
+ extmax
+# endif
+ );
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index ead424aaaaf..4ca0dc2225e 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
#define BVH_STACK_SIZE 192
#define BVH_QSTACK_SIZE 384
-
+#define BVH_OSTACK_SIZE 768
/* BVH intersection function variations */
#define BVH_INSTANCING 1
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index ce5fc7be33d..7d03855cb8f 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume.h"
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_volume.h"
+#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect,
+ visibility);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 2ee29ac9c27..3d9b598914f 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -19,6 +19,9 @@
#ifdef __QBVH__
# include "kernel/bvh/qbvh_volume_all.h"
+#ifdef __KERNEL_AVX2__
+# include "kernel/bvh/obvh_volume_all.h"
+#endif
#endif
#if BVH_FEATURE(BVH_HAIR)
@@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint visibility)
{
switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+ case BVH_LAYOUT_BVH8:
+ return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+ ray,
+ isect_array,
+ max_hits,
+ visibility);
+#endif
#ifdef __QBVH__
case BVH_LAYOUT_BVH4:
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
new file mode 100644
index 00000000000..50bcfa79b6c
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_local.h
@@ -0,0 +1,409 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ LocalIntersection *local_isect,
+ int local_object,
+ uint *lcg_state,
+ int max_hits)
+{
+ /* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_tex_fetch(__object_node, local_object);
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = ray->t;
+
+ local_isect->num_hits = 0;
+
+ const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+ if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+ isect_t = bvh_instance_motion_push(kg,
+ local_object,
+ ray,
+ &P,
+ &dir,
+ &idir,
+ isect_t,
+ &ob_itfm);
+#else
+ isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+#endif
+ object = local_object;
+ }
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for three or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+ int prim_addr = __float_as_int(leaf.x);
+
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ /* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect_local(kg,
+ local_isect,
+ P,
+ dir,
+ object,
+ local_object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits))
+ {
+ return true;
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ /* Intersect ray against primitive. */
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(motion_triangle_intersect_local(kg,
+ local_isect,
+ P,
+ dir,
+ ray->time,
+ object,
+ local_object,
+ prim_addr,
+ isect_t,
+ lcg_state,
+ max_hits))
+ {
+ return true;
+ }
+ }
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+ return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h
new file mode 100644
index 00000000000..93f35f6dffb
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_nodes.h
@@ -0,0 +1,532 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Aligned nodes intersection AVX code is adapted from Embree.
+ */
+
+struct OBVHStackItem {
+ int addr;
+ float dist;
+};
+
+ccl_device_inline void obvh_near_far_idx_calc(const float3& idir,
+ int *ccl_restrict near_x,
+ int *ccl_restrict near_y,
+ int *ccl_restrict near_z,
+ int *ccl_restrict far_x,
+ int *ccl_restrict far_y,
+ int *ccl_restrict far_z)
+{
+#ifdef __KERNEL_SSE__
+ *near_x = 0; *far_x = 1;
+ *near_y = 2; *far_y = 3;
+ *near_z = 4; *far_z = 5;
+
+ const size_t mask = movemask(ssef(idir.m128));
+
+ const int mask_x = mask & 1;
+ const int mask_y = (mask & 2) >> 1;
+ const int mask_z = (mask & 4) >> 2;
+
+ *near_x += mask_x; *far_x -= mask_x;
+ *near_y += mask_y; *far_y -= mask_y;
+ *near_z += mask_z; *far_z -= mask_z;
+#else
+ if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
+ if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
+ if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
+#endif
+}
+
+ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a,
+ OBVHStackItem *ccl_restrict b)
+{
+ OBVHStackItem tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3)
+{
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+ if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4)
+{
+ if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+ if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); }
+ if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); }
+ if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); }
+ if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5)
+{
+ obvh_stack_sort(s1, s2, s3, s4);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6,
+ OBVHStackItem *ccl_restrict s7)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5, s6);
+ if(s7->dist < s6->dist) {
+ obvh_item_swap(s6, s7);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+ OBVHStackItem *ccl_restrict s2,
+ OBVHStackItem *ccl_restrict s3,
+ OBVHStackItem *ccl_restrict s4,
+ OBVHStackItem *ccl_restrict s5,
+ OBVHStackItem *ccl_restrict s6,
+ OBVHStackItem *ccl_restrict s7,
+ OBVHStackItem *ccl_restrict s8)
+{
+ obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
+ if(s8->dist < s7->dist) {
+ obvh_item_swap(s7, s8);
+ if(s7->dist < s6->dist) {
+ obvh_item_swap(s6, s7);
+ if(s6->dist < s5->dist) {
+ obvh_item_swap(s5, s6);
+ if(s5->dist < s4->dist) {
+ obvh_item_swap(s4, s5);
+ if(s4->dist < s3->dist) {
+ obvh_item_swap(s3, s4);
+ if(s3->dist < s2->dist) {
+ obvh_item_swap(s2, s3);
+ if(s2->dist < s1->dist) {
+ obvh_item_swap(s1, s2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#else
+ const avx3f& org,
+#endif
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+ const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x);
+ const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y);
+ const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z);
+ const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x);
+ const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y);
+ const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z);
+
+ const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const avxb vmask = tnear <= tfar;
+ int mask = (int)movemask(vmask);
+ *dist = tnear;
+ return mask;
+#else
+ return 0;
+#endif
+}
+
+ccl_device_inline int obvh_aligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#else
+ const avx3f& P,
+#endif
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+ const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
+ const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
+ const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
+ const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
+ const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
+ const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+ const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+ const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const avxb vmask = round_down*tnear <= round_up*tfar;
+ int mask = (int)movemask(vmask);
+ *dist = tnear;
+ return mask;
+#else
+ return 0;
+#endif
+}
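
/* Illustrative sketch (not part of this patch): the _robust variants widen
 * the accepted [tnear, tfar] interval by the small relative factor difl
 * used for minimum-width hair, so thin curve segments that graze a child
 * box are not culled too early. The acceptance test above, in scalar form:
 */
inline bool obvh_robust_accept_sketch(float tnear, float tfar, float difl)
{
  const float round_down = 1.0f - difl;
  const float round_up = 1.0f + difl;
  return round_down * tnear <= round_up * tfar;
}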
+
+/* Unaligned nodes intersection */
+
+ccl_device_inline int obvh_unaligned_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#endif
+ const avx3f& org,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+ const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+ const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+ const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+ const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+ const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+ const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+ const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+ const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+ const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+ const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+ const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+ const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+ const avxf neg_one(-1.0f);
+ const avxf nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const avxf tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const avxf tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+ const avxf tnear_x = min(tlower_x, tupper_x);
+ const avxf tnear_y = min(tlower_y, tupper_y);
+ const avxf tnear_z = min(tlower_z, tupper_z);
+ const avxf tfar_x = max(tlower_x, tupper_x);
+ const avxf tfar_y = max(tlower_y, tupper_y);
+ const avxf tfar_z = max(tlower_z, tupper_z);
+ const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+ const avxb vmask = tnear <= tfar;
+ *dist = tnear;
+ return movemask(vmask);
+}
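
/* Illustrative sketch (not part of this patch): judging from the plane
 * tests against 0 and 1 above, each unaligned child stores a transform
 * that maps its oriented bounding box to the unit cube [0,1]^3; the ray
 * is taken into that space and a regular slab test follows. Scalar
 * equivalent for a single child, with a hypothetical row-major float[3][4]
 * standing in for the fetched tfm_* rows:
 */
inline bool unaligned_box_test_sketch(const float tfm[3][4],
                                      const float P[3],
                                      const float dir[3],
                                      float isect_near,
                                      float isect_far)
{
  float P_local[3], dir_local[3];
  for(int i = 0; i < 3; i++) {
    P_local[i] = tfm[i][0]*P[0] + tfm[i][1]*P[1] + tfm[i][2]*P[2] + tfm[i][3];
    dir_local[i] = tfm[i][0]*dir[0] + tfm[i][1]*dir[1] + tfm[i][2]*dir[2];
  }
  float tnear = isect_near, tfar = isect_far;
  for(int i = 0; i < 3; i++) {
    const float nrdir = -1.0f / dir_local[i];
    const float t0 = P_local[i] * nrdir;  /* Hits the plane at 0. */
    const float t1 = t0 - nrdir;          /* Hits the plane at 1. */
    const float lo = (t0 < t1) ? t0 : t1;
    const float hi = (t0 < t1) ? t1 : t0;
    tnear = (lo > tnear) ? lo : tnear;
    tfar = (hi < tfar) ? hi : tfar;
  }
  return tnear <= tfar;
}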
+
+ccl_device_inline int obvh_unaligned_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#endif
+ const avx3f& P,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+ const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+ const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+ const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+ const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+ const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+ const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+ const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+ const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+ const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+ const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+ const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+ const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+ aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+ aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+ const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
+ aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+ aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+
+ const avxf neg_one(-1.0f);
+ const avxf nrdir_x = neg_one / aligned_dir_x,
+ nrdir_y = neg_one / aligned_dir_y,
+ nrdir_z = neg_one / aligned_dir_z;
+
+ const avxf tlower_x = aligned_P_x * nrdir_x,
+ tlower_y = aligned_P_y * nrdir_y,
+ tlower_z = aligned_P_z * nrdir_z;
+
+ const avxf tupper_x = tlower_x - nrdir_x,
+ tupper_y = tlower_y - nrdir_y,
+ tupper_z = tlower_z - nrdir_z;
+
+ const float round_down = 1.0f - difl;
+ const float round_up = 1.0f + difl;
+
+ const avxf tnear_x = min(tlower_x, tupper_x);
+ const avxf tnear_y = min(tlower_y, tupper_y);
+ const avxf tnear_z = min(tlower_z, tupper_z);
+ const avxf tfar_x = max(tlower_x, tupper_x);
+ const avxf tfar_y = max(tlower_y, tupper_y);
+ const avxf tfar_z = max(tlower_z, tupper_z);
+
+ const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+ const avxb vmask = round_down*tnear <= round_up*tfar;
+ *dist = tnear;
+ return movemask(vmask);
+}
+
+/* Intersectors wrappers.
+ *
+ * They'll check node type and call appropriate intersection code.
+ */
+
+ccl_device_inline int obvh_node_intersect(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& org_idir,
+#endif
+ const avx3f& org,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return obvh_unaligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#endif
+ org,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+ else {
+ return obvh_aligned_node_intersect(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ org_idir,
+#else
+ org,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ dist);
+ }
+}
+
+ccl_device_inline int obvh_node_intersect_robust(
+ KernelGlobals *ccl_restrict kg,
+ const avxf& isect_near,
+ const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+ const avx3f& P_idir,
+#endif
+ const avx3f& P,
+ const avx3f& dir,
+ const avx3f& idir,
+ const int near_x,
+ const int near_y,
+ const int near_z,
+ const int far_x,
+ const int far_y,
+ const int far_z,
+ const int node_addr,
+ const float difl,
+ avxf *ccl_restrict dist)
+{
+ const int offset = node_addr;
+ const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+ if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+ return obvh_unaligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#endif
+ P,
+ dir,
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+ else {
+ return obvh_aligned_node_intersect_robust(kg,
+ isect_near,
+ isect_far,
+#ifdef __KERNEL_AVX2__
+ P_idir,
+#else
+ P,
+#endif
+ idir,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ dist);
+ }
+}
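
/* Illustrative sketch (not part of this patch): the traversal headers that
 * use these intersectors consume the returned hit mask with __bscf()
 * (bit scan forward and clear). A portable scalar stand-in with
 * hypothetical names, plus the typical consumption pattern:
 */
inline int bscf_sketch(int &mask)
{
  /* Assumes mask != 0: find the lowest set bit, then clear it. */
  int bit = 0;
  while(((mask >> bit) & 1) == 0) {
    bit++;
  }
  mask &= mask - 1;
  return bit;
}

inline void visit_hit_children_sketch(int child_mask, const int child_addr[8])
{
  while(child_mask != 0) {
    const int r = bscf_sketch(child_mask);
    /* child_addr[r] would be pushed on the stack together with its entry
     * distance, exactly as the traversal loops in the obvh_*.h headers do. */
    (void)child_addr[r];
  }
}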
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
new file mode 100644
index 00000000000..3e877065127
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const int skip_object,
+ const uint max_hits,
+ uint *num_hits)
+{
+ /* TODO(sergey):
+ * - Test if pushing distance on the stack helps.
+ * - Likely and unlikely for if() statements.
+ * - Test restrict attribute for pointers.
+ */
+
+ /* Traversal stack in local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ *num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ (void)inodes;
+
+ if(false
+#ifdef __VISIBILITY_FLAG__
+ || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+#endif
+#if BVH_FEATURE(BVH_MOTION)
+ || UNLIKELY(ray->time < inodes.y)
+ || UNLIKELY(ray->time > inodes.z)
+#endif
+ ) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ if(p_type == PRIMITIVE_TRIANGLE) {
+ int prim_count = prim_addr2 - prim_addr;
+ if(prim_count < 3) {
+ while(prim_addr < prim_addr2) {
+ kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+ int hit = triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ /* Shadow ray early termination. */
+ if(hit) {
+ /* detect if this surface has a shader with transparent shadows */
+
+ /* todo: optimize so primitive visibility flag indicates if
+ * the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+ /* move on to next entry in intersections array */
+ isect_array++;
+ (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+
+ isect_array->t = isect_t;
+ }
+
+ prim_addr++;
+ } // while
+ }
+ else {
+ kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int* nhiptr = &num_hits_in_instance;
+#else
+ int nhi = 0;
+ int *nhiptr = &nhi;
+#endif
+
+ int result = triangle_intersect8(kg,
+ &isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ prim_count,
+ num_hits,
+ max_hits,
+ nhiptr,
+ isect_t);
+ if(result == 2) {
+ return true;
+ }
+ } // prim_count
+ } // PRIMITIVE_TRIANGLE
+ else {
+ while(prim_addr < prim_addr2) {
+ kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+
+#ifdef __SHADOW_TRICKS__
+ uint tri_object = (object == OBJECT_NONE)
+ ? kernel_tex_fetch(__prim_object, prim_addr)
+ : object;
+ if(tri_object == skip_object) {
+ ++prim_addr;
+ continue;
+ }
+#endif
+
+ bool hit;
+
+ /* todo: specialized intersect functions which don't fill in
+ * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+ * might give a few % performance improvement */
+
+ switch(p_type) {
+
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ hit = motion_triangle_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ ray->time,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr);
+ break;
+ }
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
+ }
+ else {
+ hit = curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ PATH_RAY_SHADOW,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
+ }
+ break;
+ }
+#endif
+ default: {
+ hit = false;
+ break;
+ }
+ }
+
+ /* Shadow ray early termination. */
+ if(hit) {
+ /* detect if this surface has a shader with transparent shadows */
+
+ /* todo: optimize so primitive visibility flag indicates if
+ * the primitive has a transparent shadow shader? */
+ int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return true;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return true;
+ }
+
+ /* move on to next entry in intersections array */
+ isect_array++;
+ (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+
+ isect_array->t = isect_t;
+ }
+
+ prim_addr++;
+ } // while prim
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+# else
+ isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+# endif
+
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+# endif
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return false;
+}
+
+#undef NODE_INTERSECT
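
The shadow_all kernel above records every hit against a transparent shadow
caster into isect_array and stops as soon as an opaque surface is hit or
max_hits entries are filled; on instance pop the recorded t values are rescaled
back by t_fac. A condensed sketch of that accumulation policy (not part of this
patch), with a plain flag argument standing in for the kernel's shader-flag
fetch:

/* Returns true when light is fully blocked and traversal can stop. */
inline bool record_transparent_hit_sketch(bool has_transparent_shadow,
                                          unsigned int *num_hits,
                                          unsigned int max_hits)
{
  if(!has_transparent_shadow) {
    return true;  /* Opaque surface: all light blocked. */
  }
  if(*num_hits == max_hits) {
    return true;  /* Too many transparent layers: treat as blocked. */
  }
  (*num_hits)++;  /* Keep this hit and continue to the next one. */
  return false;
}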
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
new file mode 100644
index 00000000000..2021d8e1143
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_traversal.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ ,uint *lcg_state,
+ float difl,
+ float extmax
+#endif
+ )
+{
+ /* Traversal stack in local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[0].dist = -FLT_MAX;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+ float node_dist = -FLT_MAX;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ BVH_DEBUG_INIT();
+ avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+ (void)inodes;
+
+ if(UNLIKELY(node_dist > isect->t)
+#if BVH_FEATURE(BVH_MOTION)
+ || UNLIKELY(ray->time < inodes.y)
+ || UNLIKELY(ray->time > inodes.z)
+#endif
+#ifdef __VISIBILITY_FLAG__
+ || (__float_as_uint(inodes.x) & visibility) == 0
+#endif
+ )
+ {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ int child_mask;
+ avxf dist;
+
+ BVH_DEBUG_NEXT_NODE();
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+ if(difl != 0.0f) {
+ /* NOTE: We extend all the child bounding boxes instead of fetching
+ * and checking visibility flags for each of them.
+ *
+ * Need to test if doing the opposite would be any faster.
+ */
+ child_mask = NODE_INTERSECT_ROBUST(kg,
+ tnear,
+ tfar,
+# ifdef __KERNEL_AVX2__
+ P_idir4,
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+# endif
+# if BVH_FEATURE(BVH_HAIR)
+ dir4,
+# endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ difl,
+ &dist);
+ }
+ else
+#endif /* BVH_HAIR_MINIMUM_WIDTH */
+ {
+ child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+ }
+
+ if(child_mask != 0) {
+ avxf cnodes;
+ /* TODO(sergey): Investigate whether moving cnodes upwards
+ * gives a speedup (will be different cache pattern but will
+ * avoid the extra check here).
+ */
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ float d0 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ node_dist = d0;
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ node_dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ node_dist = d0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+#ifdef __VISIBILITY_FLAG__
+ if(UNLIKELY((node_dist > isect->t) ||
+ ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+ if(UNLIKELY((node_dist > isect->t)))
+#endif
+ {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ continue;
+ }
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ int prim_count = prim_addr2 - prim_addr;
+ if(prim_count < 3) {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr))
+ {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ } // for
+ }
+ else {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(triangle_intersect8(kg,
+ &isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ prim_count,
+ 0,
+ 0,
+ NULL,
+ 0.0f))
+ {
+ tfar = avxf(isect->t);
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ } // prim count
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ if(motion_triangle_intersect(kg,
+ isect,
+ P,
+ dir,
+ ray->time,
+ visibility,
+ object,
+ prim_addr))
+ {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ BVH_DEBUG_NEXT_INTERSECTION();
+ const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+ kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
+ bool hit;
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+ hit = cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ else {
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
+ }
+ if(hit) {
+ tfar = avxf(isect->t);
+ /* Shadow ray early termination. */
+ if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ return true;
+ }
+ }
+ }
+ break;
+ }
+#endif /* BVH_FEATURE(BVH_HAIR) */
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+# if BVH_FEATURE(BVH_MOTION)
+ qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
+# else
+ qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+ traversal_stack[stack_ptr].dist = -FLT_MAX;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+
+ BVH_DEBUG_NEXT_INSTANCE();
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ node_dist = traversal_stack[stack_ptr].dist;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
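
The closest-hit kernel above stores the entry distance of every pushed node and
discards popped nodes whose distance already exceeds isect->t; it also shrinks
the packed far bound tfar whenever a closer primitive hit is found. A minimal
sketch of that pruning step (not part of this patch), with a hypothetical stack
item type and sentinel:

struct NodeEntrySketch {
  int addr;
  float dist;  /* Ray t at which the node's bounds were entered. */
};

/* Pop entries until one may still contain a hit closer than closest_t;
 * returns the sentinel address when the stack runs out. */
inline int pop_useful_node_sketch(const NodeEntrySketch *stack,
                                  int *stack_ptr,
                                  float closest_t,
                                  int entry_sentinel)
{
  while(*stack_ptr >= 0) {
    const NodeEntrySketch entry = stack[(*stack_ptr)--];
    if(entry.dist <= closest_t) {
      return entry.addr;
    }
  }
  return entry_sentinel;
}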
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
new file mode 100644
index 00000000000..da9ddbd4f24
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_volume.h
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility)
+{
+ /* Traversal stack in local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return false;
+ }
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+ avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for 3 or more hit children. We push
+ * all nodes onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+ if((__float_as_uint(leaf.z) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, prim_addr) : object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, prim_addr) : object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+# if BVH_FEATURE(BVH_MOTION)
+ isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+# else
+ isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect->t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
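
For reference, a scalar sketch of what obvh_near_far_idx_calc() is doing before traversal starts and after every instance push/pop above: per axis, the sign of the inverse ray direction decides which slab bound is the near plane and which is the far plane, so the node intersection loop can load both without branching. The function name and the bound offsets below are illustrative only; the real OBVH node layout stores the bounds of eight children per row.

    /* Illustrative sketch, not the kernel helper itself. */
    static inline void near_far_idx_sketch(const float idir[3],
                                           int near_idx[3], int far_idx[3])
    {
        for(int axis = 0; axis < 3; axis++) {
            if(idir[axis] >= 0.0f) {
                near_idx[axis] = axis;      /* lower bound is the near slab */
                far_idx[axis] = axis + 3;   /* upper bound is the far slab */
            }
            else {
                near_idx[axis] = axis + 3;  /* negative direction: swap slabs */
                far_idx[axis] = axis;
            }
        }
    }
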
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
new file mode 100644
index 00000000000..a88573e6f86
--- /dev/null
+++ b/intern/cycles/kernel/bvh/obvh_volume_all.h
@@ -0,0 +1,554 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+# define NODE_INTERSECT obvh_node_intersect
+#else
+# define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect_array,
+ const uint max_hits,
+ const uint visibility)
+{
+ /* Traversal stack in thread-local memory. */
+ OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+ traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+ /* Traversal variables in registers. */
+ int stack_ptr = 0;
+ int node_addr = kernel_data.bvh.root;
+
+ /* Ray parameters in registers. */
+ const float tmax = ray->t;
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+ float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+ Transform ob_itfm;
+#endif
+
+ uint num_hits = 0;
+ isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+ if(!isfinite(P.x)) {
+ return 0;
+ }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ int num_hits_in_instance = 0;
+#endif
+
+ avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+ avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+ avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+ float3 P_idir = P*idir;
+ avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+ /* Offsets to select the side that becomes the lower or upper bound. */
+ int near_x, near_y, near_z;
+ int far_x, far_y, far_z;
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+
+ /* Traversal loop. */
+ do {
+ do {
+ /* Traverse internal nodes. */
+ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+ float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+ if((__float_as_uint(inodes.x) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+#endif
+
+ avxf dist;
+ int child_mask = NODE_INTERSECT(kg,
+ tnear,
+ tfar,
+#ifdef __KERNEL_AVX2__
+ P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+ dir4,
+#endif
+ idir4,
+ near_x, near_y, near_z,
+ far_x, far_y, far_z,
+ node_addr,
+ &dist);
+
+ if(child_mask != 0) {
+ avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+ if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+ }
+ else
+#endif
+ {
+ cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+ }
+
+ /* One child is hit, continue with that child. */
+ int r = __bscf(child_mask);
+ if(child_mask == 0) {
+ node_addr = __float_as_int(cnodes[r]);
+ continue;
+ }
+
+ /* Two children are hit, push far child, and continue with
+ * closer child.
+ */
+ int c0 = __float_as_int(cnodes[r]);
+ float d0 = ((float*)&dist)[r];
+ r = __bscf(child_mask);
+ int c1 = __float_as_int(cnodes[r]);
+ float d1 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ if(d1 < d0) {
+ node_addr = c1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+ continue;
+ }
+ else {
+ node_addr = c0;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ continue;
+ }
+ }
+
+ /* Here starts the slow path for when three or more children are hit.
+ * We push all of them onto the stack to sort them there.
+ */
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c1;
+ traversal_stack[stack_ptr].dist = d1;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c0;
+ traversal_stack[stack_ptr].dist = d0;
+
+ /* Three children are hit, push all onto stack and sort 3
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c2 = __float_as_int(cnodes[r]);
+ float d2 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Four children are hit, push all onto stack and sort 4
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c3 = __float_as_int(cnodes[r]);
+ float d3 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c3;
+ traversal_stack[stack_ptr].dist = d3;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c2;
+ traversal_stack[stack_ptr].dist = d2;
+
+ /* Five children are hit, push all onto stack and sort 5
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c4 = __float_as_int(cnodes[r]);
+ float d4 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Six children are hit, push all onto stack and sort 6
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c5 = __float_as_int(cnodes[r]);
+ float d5 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c5;
+ traversal_stack[stack_ptr].dist = d5;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c4;
+ traversal_stack[stack_ptr].dist = d4;
+
+ /* Seven children are hit, push all onto stack and sort 7
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c6 = __float_as_int(cnodes[r]);
+ float d6 = ((float*)&dist)[r];
+ if(child_mask == 0) {
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ /* Eight children are hit, push all onto stack and sort 8
+ * stack items, continue with closest child.
+ */
+ r = __bscf(child_mask);
+ int c7 = __float_as_int(cnodes[r]);
+ float d7 = ((float*)&dist)[r];
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c7;
+ traversal_stack[stack_ptr].dist = d7;
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = c6;
+ traversal_stack[stack_ptr].dist = d6;
+ obvh_stack_sort(&traversal_stack[stack_ptr],
+ &traversal_stack[stack_ptr - 1],
+ &traversal_stack[stack_ptr - 2],
+ &traversal_stack[stack_ptr - 3],
+ &traversal_stack[stack_ptr - 4],
+ &traversal_stack[stack_ptr - 5],
+ &traversal_stack[stack_ptr - 6],
+ &traversal_stack[stack_ptr - 7]);
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+
+ /* If node is leaf, fetch triangle list. */
+ if(node_addr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+ if((__float_as_uint(leaf.z) & visibility) == 0) {
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ continue;
+ }
+
+ int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(prim_addr >= 0) {
+#endif
+ int prim_addr2 = __float_as_int(leaf.y);
+ const uint type = __float_as_int(leaf.w);
+ const uint p_type = type & PRIMITIVE_ALL;
+ bool hit;
+
+ /* Pop. */
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+
+ /* Primitive intersection. */
+ switch(p_type) {
+ case PRIMITIVE_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ if(hit) {
+ /* Move on to next entry in intersections array. */
+ isect_array++;
+ num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+#endif
+ isect_array->t = isect_t;
+ if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+#endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ }
+ }
+ break;
+ }
+#if BVH_FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ for(; prim_addr < prim_addr2; prim_addr++) {
+ kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+ /* Only primitives from volume object. */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+ /* Intersect ray against primitive. */
+ hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ if(hit) {
+ /* Move on to next entry in intersections array. */
+ isect_array++;
+ num_hits++;
+# if BVH_FEATURE(BVH_INSTANCING)
+ num_hits_in_instance++;
+# endif
+ isect_array->t = isect_t;
+ if(num_hits == max_hits) {
+# if BVH_FEATURE(BVH_INSTANCING)
+# if BVH_FEATURE(BVH_MOTION)
+ float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+# else
+ Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+# endif
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+# endif /* BVH_FEATURE(BVH_INSTANCING) */
+ return num_hits;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ }
+ }
+#if BVH_FEATURE(BVH_INSTANCING)
+ else {
+ /* Instance push. */
+ object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+# if BVH_FEATURE(BVH_MOTION)
+ isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+# else
+ isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+# endif
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ num_hits_in_instance = 0;
+ isect_array->t = isect_t;
+
+ ++stack_ptr;
+ kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+ traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+ node_addr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* Pop. */
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+ }
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+ if(stack_ptr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* Instance pop. */
+ if(num_hits_in_instance) {
+ float t_fac;
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+# else
+ bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+# endif
+ /* Scale isect->t to adjust for instancing. */
+ for(int i = 0; i < num_hits_in_instance; i++) {
+ (isect_array-i-1)->t *= t_fac;
+ }
+ }
+ else {
+# if BVH_FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+# else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+# endif
+ }
+
+ isect_t = tmax;
+ isect_array->t = isect_t;
+
+ obvh_near_far_idx_calc(idir,
+ &near_x, &near_y, &near_z,
+ &far_x, &far_y, &far_z);
+ tfar = avxf(isect_t);
+# if BVH_FEATURE(BVH_HAIR)
+ dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+# endif
+ idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+# ifdef __KERNEL_AVX2__
+ P_idir = P*idir;
+ P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+# endif
+# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+ org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+# endif
+
+ object = OBJECT_NONE;
+ node_addr = traversal_stack[stack_ptr].addr;
+ --stack_ptr;
+ }
+#endif /* FEATURE(BVH_INSTANCING) */
+ } while(node_addr != ENTRYPOINT_SENTINEL);
+
+ return num_hits;
+}
+
+#undef NODE_INTERSECT
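
The cascade above (one hit child: continue with it; two: push the farther one; three to eight: push them and obvh_stack_sort() the top of the stack) amounts to visiting the nearest hit child immediately and leaving the rest on the stack in near-to-far pop order. A compact scalar sketch of that idea, with illustrative names, __builtin_ctz() standing in for __bscf(), and the child addresses/distances assumed to be already unpacked from the AVX registers:

    struct SketchStackItem { int addr; float dist; };

    /* Sketch only; assumes hit_mask != 0 (the caller checks child_mask). */
    static int visit_nearest_child_sketch(unsigned int hit_mask,
                                          const int child_addr[8],
                                          const float child_dist[8],
                                          SketchStackItem *stack,
                                          int *stack_ptr)
    {
        SketchStackItem hit[8];
        int num_hit = 0;
        while(hit_mask != 0) {
            const int r = __builtin_ctz(hit_mask);  /* stand-in for __bscf() */
            hit_mask &= hit_mask - 1;
            hit[num_hit].addr = child_addr[r];
            hit[num_hit].dist = child_dist[r];
            num_hit++;
        }
        /* Insertion sort by distance, nearest child first. */
        for(int i = 1; i < num_hit; i++) {
            const SketchStackItem key = hit[i];
            int j = i - 1;
            while(j >= 0 && hit[j].dist > key.dist) {
                hit[j + 1] = hit[j];
                j--;
            }
            hit[j + 1] = key;
        }
        /* Push the rest far-to-near so the nearest remaining child pops first. */
        for(int i = num_hit - 1; i >= 1; i--) {
            *stack_ptr += 1;
            stack[*stack_ptr] = hit[i];
        }
        return hit[0].addr;  /* continue traversal with the nearest child */
    }
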
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 3036efd4198..2e622af1758 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
/* Axis-aligned nodes intersection */
-ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
const ssef& isect_near,
const ssef& isect_far,
#ifdef __KERNEL_AVX2__
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index a2b1e050e58..f8c671fed14 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -1,4 +1,4 @@
-/*
+ /*
* Copyright 2014, Blender Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
return false;
}
-/* Special ray intersection routines for local intersection. In that case we
+#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D))
+#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
+ccl_device_inline
+#else
+ccl_device_forceinline
+#endif
+int ray_triangle_intersect8(KernelGlobals *kg,
+ float3 ray_P,
+ float3 ray_dir,
+ Intersection **isect,
+ uint visibility,
+ int object,
+ __m256 *triA,
+ __m256 *triB,
+ __m256 *triC,
+ int prim_addr,
+ int prim_num,
+ uint *num_hits,
+ uint max_hits,
+ int *num_hits_in_instance,
+ float isec_t)
+{
+
+ const unsigned char prim_num_mask = (1 << prim_num) - 1;
+
+ const __m256i zero256 = _mm256_setzero_si256();
+
+ const __m256 Px256 = _mm256_set1_ps(ray_P.x);
+ const __m256 Py256 = _mm256_set1_ps(ray_P.y);
+ const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
+
+ const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
+ const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
+ const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
+
+ /* Calculate vertices relative to ray origin. */
+ /* const float3 v0 = tri_c - P;
+ const float3 v1 = tri_a - P;
+ const float3 v2 = tri_b - P; */
+
+ __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
+ __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
+ __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
+
+ __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
+ __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
+ __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
+
+ __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
+ __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
+ __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
+
+ __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
+ __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
+ __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
+
+ __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
+ __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
+ __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
+
+ __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
+ __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
+ __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
+
+ /* Calculate triangle edges.
+ const float3 e0 = v2 - v0;
+ const float3 e1 = v0 - v1;
+ const float3 e2 = v1 - v2;*/
+
+ __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
+ __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
+ __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
+
+ __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
+ __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
+ __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
+
+ __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
+ __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
+ __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
+
+ /* Perform edge tests.
+ const float U = dot(cross(v2 + v0, e0), ray_dir);
+ const float V = dot(cross(v0 + v1, e1), ray_dir);
+ const float W = dot(cross(v1 + v2, e2), ray_dir);*/
+
+ //cross (AyBz - AzBy, AzBx - AxBz, AxBy - AyBx)
+ __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
+ __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
+ __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
+ //vertical dot
+ __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
+ U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256));
+ U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256));
+
+ __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
+ __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
+ __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
+ //vertical dot
+ __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
+ V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256));
+ V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256));
+
+ __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
+ __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
+ __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
+ //vertical dot
+ __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
+ W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256));
+ W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256));
+
+ //const float minUVW = min(U, min(V, W));
+ //const float maxUVW = max(U, max(V, W));
+#if 0
+ __m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256));
+ __m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256));
+
+ //if(minUVW < 0.0f && maxUVW > 0.0f)
+ __m256i mask_minmaxUVW_256 = _mm256_and_si256(
+ _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)),
+ //_mm256_castps_si256(minUVW_256),
+ _mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256));
+#else
+ __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
+ __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
+ __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
+ __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
+
+ const __m256i one256 = _mm256_set1_epi32(1);
+ const __m256i two256 = _mm256_set1_epi32(2);
+
+ __m256i mask_minmaxUVW_256 = _mm256_or_si256(
+ _mm256_cmpeq_epi32(one256, UVW_256_1),
+ _mm256_cmpeq_epi32(two256, UVW_256_1) );
+#endif
+
+ unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
+ if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
+ return false;
+ }
+
+ /* Calculate geometry normal and denominator. */
+ // const float3 Ng1 = cross(e1, e0);
+ //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+
+ __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
+ __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
+ __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
+
+ //const float3 Ng = Ng1 + Ng1;
+ Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
+ Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
+ Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
+
+ //const float den = dot3(Ng, dir);
+ //vertical dot
+ __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
+ den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256));
+ den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256));
+
+ // __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+
+ /* Perform depth test. */
+ //const float T = dot3(v0, Ng);
+ __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
+ T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256));
+ T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256));
+
+ //const int sign_den = (__float_as_int(den) & 0x80000000);
+ const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
+ __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
+
+ //const float sign_T = xor_signmask(T, sign_den);
+ __m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
+
+ /* if((sign_T < 0.0f) || mask_minmaxUVW_pos) { return false; } */
+ unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
+ if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
+ return false;
+ } /**/
+
+ __m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
+
+
+ ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
+ ccl_align(32) unsigned int mask_minmaxUVW8[8];
+
+ if(visibility == PATH_RAY_SHADOW_OPAQUE){
+ __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256
+
+ __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
+
+ __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
+ __m256 rayt_256 = _mm256_set1_ps((*isect)->t);
+
+ __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256),
+ _mm256_castps_si256(
+ _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256)
+ )
+ );
+ /* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256,
+ _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256),
+ _CMP_GT_OS
+ ) );*/
+
+ mask0 = _mm256_or_si256(mask1, mask0);
+ //unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0));
+ //unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256));
+ //unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos));
+ mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
+ mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
+
+ unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
+ if((mask_final & prim_num_mask) == 0) { //all bits NOT set
+ return false;
+ } /**/
+
+ unsigned long i = 0;
+#if defined(_MSC_VER)
+ unsigned char res = _BitScanForward(&i, (unsigned long)mask_final);
+#else
+ i = __builtin_ffs(mask_final)-1;
+#endif
+
+ den_256 = _mm256_rcp_ps(den_256); //inv_den
+ U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den
+ V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den
+ T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den
+
+ _mm256_store_ps(U8, U_256);
+ _mm256_store_ps(V8, V_256);
+ _mm256_store_ps(T8, T_256);
+
+
+ // Here we assume (kernel_tex_fetch(__prim_visibility, prim_addr + i) & visibility) is always true.
+
+ (*isect)->u = U8[i];
+ (*isect)->v = V8[i];
+ (*isect)->t = T8[i];
+
+ (*isect)->prim = (prim_addr + i);
+ (*isect)->object = object;
+ (*isect)->type = PRIMITIVE_TRIANGLE;
+
+ return true;
+ }
+ else {
+ _mm256_store_ps(den8, den_256);
+ _mm256_store_ps(U8, U_256);
+ _mm256_store_ps(V8, V_256);
+ _mm256_store_ps(T8, T_256);
+
+ _mm256_store_ps(sign_T8, sign_T_256);
+ _mm256_store_ps(xor_signmask8, xor_signmask_256);
+ _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256);
+
+ int ret = false;
+
+ if(visibility == PATH_RAY_SHADOW) {
+ for(int i = 0; i < prim_num; i++) {
+ if(!mask_minmaxUVW8[i]) {
+#ifdef __VISIBILITY_FLAG__
+ if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+#endif
+ {
+ if((sign_T8[i] >= 0.0f) &&
+ (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
+ {
+ if(den8[i]) {
+ const float inv_den = 1.0f / den8[i];
+
+ (*isect)->u = U8[i] * inv_den;
+ (*isect)->v = V8[i] * inv_den;
+ (*isect)->t = T8[i] * inv_den;
+
+ (*isect)->prim = (prim_addr + i);
+ (*isect)->object = object;
+ (*isect)->type = PRIMITIVE_TRIANGLE;
+
+ int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
+ int shader = 0;
+
+#ifdef __HAIR__
+ if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+ {
+ shader = kernel_tex_fetch(__tri_shader, prim);
+ }
+#ifdef __HAIR__
+ else {
+ float4 str = kernel_tex_fetch(__curves, prim);
+ shader = __float_as_int(str.z);
+ }
+#endif
+ int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+ /* if no transparent shadows, all light is blocked */
+ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+ return 2;
+ }
+ /* if maximum number of hits reached, block all light */
+ else if(*num_hits == max_hits) {
+ return 2;
+ }
+ /* move on to next entry in intersections array */
+ ret = true;
+
+ (*isect)++;
+ (*num_hits)++;
+
+ (*num_hits_in_instance)++;
+
+ (*isect)->t = isec_t;
+
+ } //den
+ } //if sign
+ } //vis
+ }//if mask
+ } //for
+ }
+ else { //default case
+ for(int i = 0; i < prim_num; i++) {
+ if(!mask_minmaxUVW8[i]) {
+#ifdef __VISIBILITY_FLAG__
+ if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility)
+#endif
+ {
+ if((sign_T8[i] >= 0.0f) &&
+ (sign_T8[i] <= (*isect)->t * xor_signmask8[i]))
+ {
+ if(den8[i]) {
+ const float inv_den = 1.0f / den8[i];
+
+ (*isect)->u = U8[i] * inv_den;
+ (*isect)->v = V8[i] * inv_den;
+ (*isect)->t = T8[i] * inv_den;
+
+ (*isect)->prim = (prim_addr + i);
+ (*isect)->object = object;
+ (*isect)->type = PRIMITIVE_TRIANGLE;
+
+ ret = true;
+ } //den
+ } //if sign
+ } //vis
+ }//if mask
+ } //for
+ } //default
+ return ret;
+}// else PATH_RAY_SHADOW_OPAQUE
+
+}
+
+/* Gather up to eight triangles starting at prim_addr and intersect them against one ray. */
+ccl_device_inline
+int triangle_intersect8(KernelGlobals *kg,
+ Intersection **isect,
+ float3 P,
+ float3 dir,
+ uint visibility,
+ int object,
+ int prim_addr,
+ int prim_num,
+ uint *num_hits,
+ uint max_hits,
+ int *num_hits_in_instance,
+ float isec_t)
+ {
+ __m128 tri_a[8], tri_b[8], tri_c[8];
+ __m256 tritmp[12], tri[12];
+ __m256 triA[3], triB[3], triC[3];
+
+ int i, r;
+
+ uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+ for(i = 0; i < prim_num; i++) {
+ tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++];
+ }
+ //create 9 or 12 placeholders
+ tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256
+ tri[1] = _mm256_castps128_ps256(tri_b[0]);//_mm256_zextps128_ps256
+ tri[2] = _mm256_castps128_ps256(tri_c[0]);//_mm256_zextps128_ps256
+
+ tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256
+ tri[4] = _mm256_castps128_ps256(tri_b[1]);//_mm256_zextps128_ps256
+ tri[5] = _mm256_castps128_ps256(tri_c[1]);//_mm256_zextps128_ps256
+
+ tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256
+ tri[7] = _mm256_castps128_ps256(tri_b[2]);//_mm256_zextps128_ps256
+ tri[8] = _mm256_castps128_ps256(tri_c[2]);//_mm256_zextps128_ps256
+
+ if(prim_num > 3) {
+ tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256
+ tri[10] = _mm256_castps128_ps256(tri_b[3]);//_mm256_zextps128_ps256
+ tri[11] = _mm256_castps128_ps256(tri_c[3]);//_mm256_zextps128_ps256
+ }
+
+ for(i = 4, r = 0; i < prim_num; i ++, r += 3) {
+ tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1);
+ tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
+ tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
+ }
+
+ //------------------------------------------------
+ //0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1
+ //1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
+ //2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
+
+ //3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
+ //4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1
+ //5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
+
+ //6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
+ //7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1
+ //8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
+
+ //9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1
+ //10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1
+ //11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1
+
+ //"transpose"
+ tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
+ tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1
+
+ tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
+ tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1
+
+ tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
+ tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1
+
+ tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
+ tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1
+
+ tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
+ tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1
+
+ tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
+ tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! Zc2 Zc3 1 1 Zc6 Zc7 1 1
+
+ /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+ triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
+ triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
+ triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
+
+ triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb6 Xb7
+ triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb6 Yb7
+ triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb6 Zb7
+
+ triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
+ triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
+ triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
+
+ /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+
+ int result = ray_triangle_intersect8(kg, P,
+ dir,
+ isect,
+ visibility, object,
+ triA,
+ triB,
+ triC,
+ prim_addr,
+ prim_num,
+ num_hits,
+ max_hits,
+ num_hits_in_instance,
+ isec_t);
+ return result;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
* only want to intersect with primitives in the same object, and if case of
* multiple hits we pick a single random primitive as the intersection point.
* Returns whether traversal should be stopped.
@@ -83,7 +555,7 @@ ccl_device_inline bool triangle_intersect_local(
float3 P,
float3 dir,
int object,
- int local_object,
+ int local_object,
int prim_addr,
float tmax,
uint *lcg_state,
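
The intrinsics in ray_triangle_intersect8() above are a structure-of-arrays form of the usual signed-volume edge tests: cross256(A,B, C,D) expands to A*B - C*D via a single FMA, so each group of three cross256() lines plus the FMA dot product evaluates one of U/V/W for eight triangles at once. For reference, the scalar computation of U for a single triangle, matching the formula quoted in the comments (U = dot(cross(v2 + v0, e0), ray_dir), with v0/v2 already relative to the ray origin); the function name is illustrative:

    static inline float edge_test_U_scalar(const float v0[3],
                                           const float v2[3],
                                           const float dir[3])
    {
        const float s[3] = {v0[0] + v2[0], v0[1] + v2[1], v0[2] + v2[2]};
        const float e0[3] = {v2[0] - v0[0], v2[1] - v0[1], v2[2] - v0[2]};
        /* cross(s, e0), one component per cross256() line in the AVX code. */
        const float cx = s[1] * e0[2] - s[2] * e0[1];
        const float cy = s[2] * e0[0] - s[0] * e0[2];
        const float cz = s[0] * e0[1] - s[1] * e0[0];
        /* Dot with the ray direction, the "vertical dot" in the AVX code. */
        return cx * dir[0] + cy * dir[1] + cz * dir[2];
    }
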
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 2f955741797..aa7a16afa1d 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
/* Texture types to be compatible with CUDA textures. These are really just
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
-
template<typename T> struct texture {
ccl_always_inline const T& fetch(int index)
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-
-#ifdef __KERNEL_AVX__
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
* compatibility with existing indicies and data structures.
*/
@@ -90,7 +88,6 @@ template<typename T> struct texture {
ssef *ssef_node_data = &ssef_data[index];
return _mm256_loadu_ps((float *)ssef_node_data);
}
-
#endif
#ifdef __KERNEL_SSE2__
@@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a)
print_ssei(label, a.z);
}
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+#endif
+
#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index d169915cff9..e93100a6442 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1384,8 +1384,9 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_BVH2 = (1 << 0),
BVH_LAYOUT_BVH4 = (1 << 1),
+ BVH_LAYOUT_BVH8 = (1 << 2),
- BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4,
+ BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
BVH_LAYOUT_ALL = (unsigned int)(-1),
} KernelBVHLayout;
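
Since the KernelBVHLayout values are bit flags, a device can advertise the layouts it supports as a mask and a requested layout can be degraded to the next narrower supported one. A hypothetical helper sketching that idea (not the actual Cycles selection code):

    /* Hypothetical: walk down from the requested layout, fall back to BVH2. */
    static int choose_bvh_layout(int requested, unsigned int supported_mask)
    {
        for(int layout = requested; layout >= BVH_LAYOUT_BVH2; layout >>= 1) {
            if(supported_mask & (unsigned int)layout) {
                return layout;
            }
        }
        return BVH_LAYOUT_BVH2;
    }
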
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 508f44e7c4d..291f9a9fcae 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -78,6 +78,7 @@ set(SRC_HEADERS
util_sky_model.h
util_sky_model_data.h
util_avxf.h
+ util_avxb.h
util_sseb.h
util_ssef.h
util_ssei.h
@@ -98,7 +99,9 @@ set(SRC_HEADERS
util_types_float3_impl.h
util_types_float4.h
util_types_float4_impl.h
- util_types_int2.h
+ util_types_float8.h
+ util_types_float8_impl.h
+ util_types_int2.h
util_types_int2_impl.h
util_types_int3.h
util_types_int3_impl.h
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
new file mode 100644
index 00000000000..865549be283
--- /dev/null
+++ b/intern/cycles/util/util_avxb.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxf;
+
+/*! 8-wide AVX bool type. */
+struct avxb
+{
+ typedef avxb Mask; // mask type
+ typedef avxf Float; // float type
+
+ enum { size = 8 }; // number of SIMD elements
+ union { __m256 m256; int32_t v[8]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb ( ) {}
+ __forceinline avxb ( const avxb& other ) { m256 = other.m256; }
+ __forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; }
+
+ __forceinline avxb( const __m256 input ) : m256(input) {}
+ __forceinline operator const __m256&( void ) const { return m256; }
+ __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
+ __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
+
+ //__forceinline avxb ( bool a )
+ // : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+ //__forceinline avxb ( bool a, bool b)
+ // : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ //__forceinline avxb ( bool a, bool b, bool c, bool d)
+ // : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ //__forceinline avxb(int mask) {
+ // assert(mask >= 0 && mask < 16);
+ // m128 = _mm_lookupmask_ps[mask];
+ //}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
+ __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
+ __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
+__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
+__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
+__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
+__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+
+__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
+#if defined(__KERNEL_SSE41__)
+ return _mm256_blendv_ps(f, t, m);
+#else
+ return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
+__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
+
+#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
+ ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_castsi256_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
+}
+
+/*
+template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
+ return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
+ return _mm_movehl_ps(a, a);
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
+ return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
+ return _mm_movehl_ps(b, a);
+}
+
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
+template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
+#endif
+
+#if defined(__KERNEL_SSE41__)
+template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
+template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
+#endif
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
+ bool(a[5])+bool(a[6])+bool(a[7]); }
+#endif
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
+__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n",
+ label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+//#endif
+
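
A small usage sketch of the new mask type (assumes an AVX2 kernel build with util_avxf.h/util_avxb.h included; the function name is illustrative): comparing two avxf values yields an avxb, and the reductions above collapse the mask back to scalars.

    /* Illustrative only; not part of the patch. */
    ccl_device_inline void avxb_example(const avxf& dist, const avxf& tfar)
    {
        avxb hit = dist <= tfar;      /* 8-wide per-lane comparison mask */
        if(none(hit)) {
            return;                   /* no lane passed the test */
        }
        size_t bits = movemask(hit);  /* bit i is set when lane i passed */
        size_t lanes = popcnt(hit);   /* number of passing lanes */
        (void)bits;
        (void)lanes;
    }
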
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 2451213963a..81365f96c94 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -19,7 +19,8 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX__
+struct avxb;
+
struct avxf
{
typedef avxf Float;
@@ -53,6 +54,9 @@ struct avxf
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+ __forceinline avxf(float3 a) :
+ m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
+
__forceinline avxf(int a3, int a2, int a1, int a0)
{
@@ -73,8 +77,24 @@ struct avxf
m256 = _mm256_insertf128_ps(foo, b, 1);
}
+ __forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
+ __forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
};
+__forceinline avxf cross(const avxf& a, const avxf& b)
+{
+ avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
+ 0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
+ return r;
+}
+
+__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
+{
+ const avxf t = _mm256_mul_ps(a.m256, b.m256);
+ den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+ den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
+}
+
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
+__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
+
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
@@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}
+//#if defined(__KERNEL_SSE41__)
+__forceinline avxf maxi(const avxf& a, const avxf& b) {
+ const avxf ci = _mm256_max_ps(a, b);
+ return ci;
+}
+
+__forceinline avxf mini(const avxf& a, const avxf& b) {
+ const avxf ci = _mm256_min_ps(a, b);
+ return ci;
+}
+//#endif
+
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
return c-(a*b);
#endif
}
+__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
+ return _mm256_fmsub_ps(a, b, c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __KERNEL_AVX2__
+__forceinline const avxb operator <=(const avxf& a, const avxf& b) {
+ return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
+}
+#endif
+
#endif
#ifndef _mm256_set_m128
@@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
CCL_NAMESPACE_END
-#endif
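
The new avxf(float3) constructor, cross() and dot3() above treat one 256-bit register as two packed xyz vectors, one per 128-bit half, so a single call evaluates two 3D products at once. A scalar view of dot3() for reference (the function name is illustrative):

    /* Lanes 0-2 hold one xyz pair, lanes 4-6 the other, so two dot
     * products fall out of a single 8-wide multiply. */
    static inline void dot3_scalar(const float a[8], const float b[8],
                                   float *den, float *den2)
    {
        *den = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
        *den2 = a[4] * b[4] + a[5] * b[5] + a[6] * b[6];
    }
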
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 9a66a372822..a761c9c46de 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -57,7 +57,19 @@ void DebugFlags::CPU::reset()
#undef STRINGIFY
#undef CHECK_CPU_FLAGS
- bvh_layout = BVH_LAYOUT_DEFAULT;
+ if (getenv("CYCLES_BVH2") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH2;
+ }
+ else if (getenv("CYCLES_BVH4") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH4;
+ }
+ else if (getenv("CYCLES_BVH8") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH8;
+ }
+ else {
+ bvh_layout = BVH_LAYOUT_DEFAULT;
+ }
+
split_kernel = false;
}
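
With this change the CPU BVH layout can be forced from the environment for quick A/B benchmarking. Only the presence of the variable matters (its value is ignored), and the chosen layout still has to be supported by the CPU kernels that were compiled in. An illustrative invocation (the scene file name is a placeholder):

    CYCLES_BVH4=1 blender --background scene.blend --engine CYCLES -f 1
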
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index dfe755a8789..96c549b9be5 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -121,6 +121,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2.h"
#include "util/util_types_float3.h"
#include "util/util_types_float4.h"
+#include "util/util_types_float8.h"
#include "util/util_types_vector3.h"
@@ -140,6 +141,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2_impl.h"
#include "util/util_types_float3_impl.h"
#include "util/util_types_float4_impl.h"
+#include "util/util_types_float8_impl.h"
#include "util/util_types_vector3_impl.h"
@@ -148,7 +150,10 @@ CCL_NAMESPACE_END
# include "util/util_sseb.h"
# include "util/util_ssei.h"
# include "util/util_ssef.h"
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# include "util/util_avxb.h"
# include "util/util_avxf.h"
#endif
+#endif
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h
new file mode 100644
index 00000000000..2e693d49634
--- /dev/null
+++ b/intern/cycles/util/util_types_float8.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_H__
+#define __UTIL_TYPES_FLOAT8_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+
+struct ccl_try_align(32) float8 {
+#ifdef __KERNEL_AVX2__
+ union {
+ __m256 m256;
+ struct { float a, b, c, d, e, f, g, h; };
+ };
+
+ __forceinline float8();
+ __forceinline float8(const float8& a);
+ __forceinline explicit float8(const __m256& a);
+
+ __forceinline operator const __m256&(void) const;
+ __forceinline operator __m256&(void);
+
+ __forceinline float8& operator =(const float8& a);
+
+#else /* __KERNEL_AVX2__ */
+ float a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX2__ */
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float8 make_float8(float f);
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_H__ */
diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h
new file mode 100644
index 00000000000..4fac03569e9
--- /dev/null
+++ b/intern/cycles/util/util_types_float8_impl.h
@@ -0,0 +1,113 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
+#define __UTIL_TYPES_FLOAT8_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_AVX2__
+__forceinline float8::float8()
+{
+}
+
+__forceinline float8::float8(const float8& f)
+ : m256(f.m256)
+{
+}
+
+__forceinline float8::float8(const __m256& f)
+ : m256(f)
+{
+}
+
+__forceinline float8::operator const __m256&(void) const
+{
+ return m256;
+}
+
+__forceinline float8::operator __m256&(void)
+{
+ return m256;
+}
+
+__forceinline float8& float8::operator =(const float8& f)
+{
+ m256 = f.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX2__ */
+
+__forceinline float float8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline float& float8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+ccl_device_inline float8 make_float8(float f)
+{
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_set1_ps(f));
+#else
+ float8 r = {f, f, f, f, f, f, f, f};
+#endif
+ return r;
+}
+
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+ float8 r = {a, b, c, d, e, f, g, h};
+#endif
+ return r;
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
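
One property worth keeping in mind: the AVX2 and scalar paths of make_float8() must agree on element order (member a is lane 0), which is why the wide path uses _mm256_setr_ps() above. A hypothetical sanity check, not part of the patch:

    ccl_device_inline bool float8_order_ok()
    {
        const float8 v = make_float8(0.0f, 1.0f, 2.0f, 3.0f,
                                     4.0f, 5.0f, 6.0f, 7.0f);
        for(int i = 0; i < 8; i++) {
            if(v[i] != (float)i) {
                return false;
            }
        }
        return true;
    }
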