37 files changed, 5079 insertions, 65 deletions
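The heart of this patch is an eight-wide ("octo") BVH node format aimed at AVX2 CPUs. As a reading aid for the packing code in bvh8.cpp below, the following hypothetical struct sketches the aligned inner-node layout that BVH8::pack_aligned_node() writes. The struct and its field names are assumptions, but the row order mirrors the eight float8 rows (data[0..7]) in the patch, and the total size matches BVH_ONODE_SIZE (16 float4 slots).

/* Illustrative sketch, not part of the patch: the memory image written by
 * BVH8::pack_aligned_node(). Bounds are stored SoA, one SIMD lane per
 * child, so a single 8-wide load fetches the same bound for all children. */
struct OctoNodeSketch {
	float visibility;          /* row 0: visibility & ~PATH_RAY_NODE_UNALIGNED */
	float time_from, time_to;  /* motion-blur time range */
	float pad[5];              /* remainder of row 0, zeroed by memset */
	float min_x[8], max_x[8];  /* rows 1-2 */
	float min_y[8], max_y[8];  /* rows 3-4 */
	float min_z[8], max_z[8];  /* rows 5-6 */
	int   child[8];            /* row 7: stored via __int_as_float(); < 0 = leaf */
};
static_assert(sizeof(OctoNodeSketch) == 16 * 4 * sizeof(float),
              "16 float4 slots, i.e. BVH_ONODE_SIZE");

Lanes beyond the actual child count are filled with inverted boxes (min = FLT_MAX, max = -FLT_MAX), so padding children can never pass the intersection test (the "always 4 child nodes" comments were copied over from the BVH4 code; for BVH8 the kernel assumes eight lanes).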
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index c97d942af9d..80b83c94012 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -50,6 +50,7 @@ enum_displacement_methods = ( enum_bvh_layouts = ( ('BVH2', "BVH2", "", 1), ('BVH4', "BVH4", "", 2), + ('BVH8', "BVH8", "", 4), ) enum_bvh_types = ( @@ -686,7 +687,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_bvh_layout = EnumProperty( name="BVH Layout", items=enum_bvh_layouts, - default='BVH4', + default='BVH8', ) cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index fedcb874269..fcdadf0ad6e 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -679,7 +679,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, params.texture_limit = 0; } - params.bvh_layout = DebugFlags().cpu.bvh_layout; + /* TODO(sergey): Once OSL supports per-microarchitecture optimization get + * rid of this. + */ + if (params.shadingsystem == SHADINGSYSTEM_OSL) { + params.bvh_layout = BVH_LAYOUT_BVH4; + } + else { + params.bvh_layout = DebugFlags().cpu.bvh_layout; + } return params; } diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index b8171e7f70d..fcd28572fdf 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -10,6 +10,7 @@ set(SRC bvh.cpp bvh2.cpp bvh4.cpp + bvh8.cpp bvh_binning.cpp bvh_build.cpp bvh_node.cpp @@ -22,6 +23,7 @@ set(SRC_HEADERS bvh.h bvh2.h bvh4.h + bvh8.h bvh_binning.h bvh_build.h bvh_node.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index b524ca07d8d..bc73a3ad264 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -22,6 +22,7 @@ #include "bvh/bvh2.h" #include "bvh/bvh4.h" +#include "bvh/bvh8.h" #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" @@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout) switch(layout) { case BVH_LAYOUT_BVH2: return "BVH2"; case BVH_LAYOUT_BVH4: return "BVH4"; + case BVH_LAYOUT_BVH8: return "BVH8"; case BVH_LAYOUT_NONE: return "NONE"; case BVH_LAYOUT_ALL: return "ALL"; } @@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) return new BVH2(params, objects); case BVH_LAYOUT_BVH4: return new BVH4(params, objects); + case BVH_LAYOUT_BVH8: + return new BVH8(params, objects); case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: break; @@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility) } } visibility |= ob->visibility_for_tracing(); + + } +} + +bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh) +{ + if(node->is_leaf()) { + return node->is_unaligned; + } + else { + return node_is_unaligned(node, bvh); + } +} + +bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh) +{ + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + + switch(bvh) { + case bvh2: + return node0->is_unaligned || node1->is_unaligned; + break; + case bvh4: + return leaf_check(node0, bvh2) || leaf_check(node1, bvh2); + break; + case bvh8: + return leaf_check(node0, bvh4) || leaf_check(node1, bvh4); + break; + default: + assert(0); + return false; } } @@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) * BVH's are stored in global arrays. 
This function merges them into the * top level BVH, adjusting indexes and offsets where appropriate. */ - /* TODO(sergey): This code needs adjustment for wider BVH than 4. */ const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4); + const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8); /* Adjust primitive index to point to the triangle in the global array, for * meshes with transform applied and already in the top level BVH. @@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) { size_t nsize, nsize_bbox; if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) { - nsize = use_qbvh - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_UNALIGNED_NODE_SIZE; - nsize_bbox = (use_qbvh)? 13: 0; + if(use_obvh) { + nsize = BVH_UNALIGNED_ONODE_SIZE; + nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1; + } + else { + nsize = use_qbvh + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_UNALIGNED_NODE_SIZE; + nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0; + } } else { - nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; - nsize_bbox = (use_qbvh)? 7: 0; + if(use_obvh) { + nsize = BVH_ONODE_SIZE; + nsize_bbox = BVH_ONODE_SIZE-1; + } + else { + nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; + nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0; + } } memcpy(pack_nodes + pack_nodes_offset, @@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* Modify offsets into arrays */ int4 data = bvh_nodes[i + nsize_bbox]; - - data.z += (data.z < 0)? -noffset_leaf: noffset; - data.w += (data.w < 0)? -noffset_leaf: noffset; - - if(use_qbvh) { - data.x += (data.x < 0)? -noffset_leaf: noffset; - data.y += (data.y < 0)? -noffset_leaf: noffset; + int4 data1 = bvh_nodes[i + nsize_bbox-1]; + if(use_obvh) { + data.z += (data.z < 0) ? -noffset_leaf : noffset; + data.w += (data.w < 0) ? -noffset_leaf : noffset; + data.x += (data.x < 0) ? -noffset_leaf : noffset; + data.y += (data.y < 0) ? -noffset_leaf : noffset; + data1.z += (data1.z < 0) ? -noffset_leaf : noffset; + data1.w += (data1.w < 0) ? -noffset_leaf : noffset; + data1.x += (data1.x < 0) ? -noffset_leaf : noffset; + data1.y += (data1.y < 0) ? -noffset_leaf : noffset; + } + else { + data.z += (data.z < 0) ? -noffset_leaf : noffset; + data.w += (data.w < 0) ? -noffset_leaf : noffset; + if(use_qbvh) { + data.x += (data.x < 0)? -noffset_leaf: noffset; + data.y += (data.y < 0)? -noffset_leaf: noffset; + } } - pack_nodes[pack_nodes_offset + nsize_bbox] = data; + if(use_obvh) { + pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1; + } /* Usually this copies nothing, but we better * be prepared for possible node size extension. diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 6a82f915692..86be0bae4be 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -73,6 +73,12 @@ struct PackedBVH { } }; +enum BVH_TYPE { + bvh2, + bvh4, + bvh8 +}; + /* BVH */ class BVH @@ -93,6 +99,8 @@ protected: /* Refit range of primitives. 
*/ void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility); + static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh); + static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh); /* triangles and strands */ void pack_primitives(); diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp index 9d89d2b6afb..4a423c16559 100644 --- a/intern/cycles/bvh/bvh2.cpp +++ b/intern/cycles/bvh/bvh2.cpp @@ -25,13 +25,6 @@ CCL_NAMESPACE_BEGIN -static bool node_bvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - return node0->is_unaligned || node1->is_unaligned; -} - BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { @@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root) } else { stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_bvh_is_unaligned(root) + nextNodeIdx += node_is_unaligned(root, bvh2) ? BVH_UNALIGNED_NODE_SIZE : BVH_NODE_SIZE; } @@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root) } else { idx[i] = nextNodeIdx; - nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2) ? BVH_UNALIGNED_NODE_SIZE : BVH_NODE_SIZE; } diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp index 4faf47af7bb..a449587e607 100644 --- a/intern/cycles/bvh/bvh4.cpp +++ b/intern/cycles/bvh/bvh4.cpp @@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN * Perhaps we can merge nodes in actual tree and make our * life easier all over the place. */ -static bool node_qbvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - bool has_unaligned = false; - if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned; - } - else { - has_unaligned |= node0->get_child(0)->is_unaligned; - has_unaligned |= node0->get_child(1)->is_unaligned; - } - if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned; - } - else { - has_unaligned |= node1->get_child(0)->is_unaligned; - has_unaligned |= node1->get_child(1)->is_unaligned; - } - return has_unaligned; -} BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) @@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root) } else { stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_qbvh_is_unaligned(root) + nextNodeIdx += node_is_unaligned(root, bvh4) ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; } @@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root) } else { idx = nextNodeIdx; - nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + nextNodeIdx += node_is_unaligned(nodes[i], bvh4) ? 
BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; } @@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) visibility, 0.0f, 1.0f, - 4); + num_nodes); } else { pack_aligned_node(idx, @@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) visibility, 0.0f, 1.0f, - 4); + num_nodes); } } } diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp new file mode 100644 index 00000000000..e8b6726602c --- /dev/null +++ b/intern/cycles/bvh/bvh8.cpp @@ -0,0 +1,515 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "bvh/bvh8.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ +} + +void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +{ + float4 data[BVH_ONODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE); +} + +void BVH8::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. 
+ */ + pack_aligned_inner(e, en, num); + } +} + +void BVH8::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + BoundBox bounds[8]; + int child[8]; + for(int i = 0; i < num; ++i) { + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_aligned_node(e.idx, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH8::pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float8 data[8]; + memset(data, 0, sizeof(data)); + + data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); + data[0].b = time_from; + data[0].c = time_to; + for(int i = 0; i < num; i++) { + float3 bb_min = bounds[i].min; + float3 bb_max = bounds[i].max; + + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; + + data[7][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 8; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE); +} + +void BVH8::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + Transform aligned_space[8]; + BoundBox bounds[8]; + int child[8]; + for(int i = 0; i < num; ++i) { + aligned_space[i] = en[i].node->get_aligned_space(); + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_unaligned_node(e.idx, + aligned_space, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH8::pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float8 data[BVH_UNALIGNED_ONODE_SIZE]; + memset(data, 0, sizeof(data)); + data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); + data[0].b = time_from; + data[0].c = time_to; + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + bounds[i], + aligned_space[i]); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; + + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; + + data[13][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 8; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + + data[1][i] = 1.0f; + data[2][i] = 0.0f; + data[3][i] = 0.0f; + + data[4][i] = 0.0f; + data[5][i] = 0.0f; + data[6][i] = 0.0f; + + data[7][i] = 0.0f; + data[8][i] = 0.0f; + data[9][i] = 0.0f; + + data[10][i] = -FLT_MAX; + data[11][i] = -FLT_MAX; + data[12][i] = -FLT_MAX; + + data[13][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE); +} + +/* Quad SIMD Nodes */ + +void BVH8::pack_nodes(const BVHNode *root) +{ + /* Calculate size of the arrays required. 
*/ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_ONODE_SIZE; + } + /* Resize arrays. */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_is_unaligned(root, bvh8) + ? BVH_UNALIGNED_ONODE_SIZE + : BVH_ONODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* Inner node. */ + const BVHNode *node = e.node; + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + /* Collect nodes. */ + const BVHNode *nodes[8]; + int numnodes = 0; + if(node0->is_leaf()) { + nodes[numnodes++] = node0; + } + else { + const BVHNode *node00 = node0->get_child(0), + *node01 = node0->get_child(1); + if(node00->is_leaf()) { + nodes[numnodes++] = node00; + } + else { + nodes[numnodes++] = node00->get_child(0); + nodes[numnodes++] = node00->get_child(1); + } + if(node01->is_leaf()) { + nodes[numnodes++] = node01; + } + else { + nodes[numnodes++] = node01->get_child(0); + nodes[numnodes++] = node01->get_child(1); + } + } + if(node1->is_leaf()) { + nodes[numnodes++] = node1; + } + else { + const BVHNode *node10 = node1->get_child(0), + *node11 = node1->get_child(1); + if(node10->is_leaf()) { + nodes[numnodes++] = node10; + } + else { + nodes[numnodes++] = node10->get_child(0); + nodes[numnodes++] = node10->get_child(1); + } + if(node11->is_leaf()) { + nodes[numnodes++] = node11; + } + else { + nodes[numnodes++] = node11->get_child(0); + nodes[numnodes++] = node11->get_child(1); + } + } + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { + int idx; + if(nodes[i]->is_leaf()) { + idx = nextLeafNodeIdx++; + } + else { + idx = nextNodeIdx; + nextNodeIdx += node_is_unaligned(nodes[i], bvh8) + ? BVH_UNALIGNED_ONODE_SIZE + : BVH_ONODE_SIZE; + } + stack.push_back(BVHStackEntry(nodes[i], idx)); + } + /* Set node. */ + pack_inner(e, &stack[stack.size() - numnodes], numnodes); + } + } + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ + pack.root_index = (root->is_leaf()) ? -1 : 0; +} + +void BVH8::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? 
true: false, bbox, visibility); +} + +void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + int4 *data = &pack.leaf_nodes[idx]; + int4 c = data[0]; + /* Refit leaf node. */ + for(int prim = c.x; prim < c.y; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* Object instance. */ + bbox.grow(ob->bounds); + } + else { + /* Primitives. */ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* Curves. */ + int str_offset = (params.top_level) ? mesh->curve_offset : 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* Motion curves. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) { + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + } + else { + /* Triangles. */ + int tri_offset = (params.top_level) ? mesh->tri_offset : 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* Motion triangles. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) { + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + } + + visibility |= ob->visibility; + } + + float4 leaf_data[BVH_ONODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c.x); + leaf_data[0].y = __int_as_float(c.y); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(c.w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE); + } + else { + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[BVH_UNALIGNED_ONODE_SIZE-1]; + } + else { + c = data[BVH_ONODE_SIZE-1]; + } + /* Refit inner node, set bbox from children. */ + BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty, + BoundBox::empty, BoundBox::empty }; + uint child_visibility[8] = { 0 }; + int num_nodes = 0; + + for(int i = 0; i < 8; ++i) { + if(c[i] != 0) { + refit_node((c[i] < 0)? 
-c[i]-1: c[i], (c[i] < 0), + child_bbox[i], child_visibility[i]); + ++num_nodes; + bbox.grow(child_bbox[i]); + visibility |= child_visibility[i]; + } + } + + if(is_unaligned) { + Transform aligned_space[8] = { transform_identity(), transform_identity(), + transform_identity(), transform_identity(), + transform_identity(), transform_identity(), + transform_identity(), transform_identity()}; + pack_unaligned_node(idx, + aligned_space, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + num_nodes); + } + else { + pack_aligned_node(idx, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + num_nodes); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h new file mode 100644 index 00000000000..99ff79e60b5 --- /dev/null +++ b/intern/cycles/bvh/bvh8.h @@ -0,0 +1,97 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __BVH8_H__ +#define __BVH8_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_ONODE_SIZE 16 +#define BVH_ONODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_ONODE_SIZE 28 + +/* BVH8 +* +* Octo BVH, with each node having eight children, to use with SIMD instructions. 
+*/ +class BVH8 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH8(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH8_H__ */ diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 879d07b9625..fd8b4977699 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } } return cnt; + case BVH_STAT_ONODE_COUNT: + cnt = 1; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + cnt += 1; + } + else { + for(int j = 0; j < node->num_children(); j++) + { + BVHNode *node_next = node->get_child(j); + if(node_next->is_leaf()) { + cnt += 1; + } + else { + for(int k = 0; k < node_next->num_children(); k++) { + cnt += node_next->get_child(k)->getSubtreeSize(stat); + } + } + } + } + } + return cnt; + case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned; + } + else { + for(int j = 0; j < node->num_children(); j++) { + BVHNode *node_next = node->get_child(j); + if(node_next->is_leaf()) { + has_unaligned |= node_next->is_unaligned; + } + else { + for(int k = 0; k < node_next->num_children(); k++) { + cnt += node_next->get_child(k)->getSubtreeSize(stat); + has_unaligned |= node_next->get_child(k)->is_unaligned; + } + } + } + } + } + cnt += has_unaligned? 
1: 0; + } + return cnt; case BVH_STAT_ALIGNED_COUNT: if(!is_unaligned) { cnt = 1; diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 94cf5ab730c..ed89d52a50a 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -39,6 +39,8 @@ enum BVH_STAT { BVH_STAT_ALIGNED_LEAF_COUNT, BVH_STAT_UNALIGNED_LEAF_COUNT, BVH_STAT_DEPTH, + BVH_STAT_ONODE_COUNT, + BVH_STAT_UNALIGNED_INNER_ONODE_COUNT, }; class BVHParams; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index e92bbbfa6e6..5b3761f8353 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices) if(system_cpu_support_sse2()) { info.bvh_layout_mask |= BVH_LAYOUT_BVH4; } + if (system_cpu_support_avx2()) { + info.bvh_layout_mask |= BVH_LAYOUT_BVH8; + } info.has_volume_decoupled = true; info.has_osl = true; info.has_half_images = true; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 118e18374a1..c6e92c6d89d 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS bvh/qbvh_traversal.h bvh/qbvh_volume.h bvh/qbvh_volume_all.h + bvh/obvh_nodes.h + bvh/obvh_shadow_all.h + bvh/obvh_local.h + bvh/obvh_traversal.h + bvh/obvh_volume.h + bvh/obvh_volume_all.h ) set(SRC_HEADERS @@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS ../util/util_types_float3_impl.h ../util/util_types_float4.h ../util/util_types_float4_impl.h + ../util/util_types_float8.h + ../util/util_types_float8_impl.h ../util/util_types_int2.h ../util/util_types_int2_impl.h ../util/util_types_int3.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index de1503e5564..2ad55d041bf 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. 
*/ #ifdef __QBVH__ # include "kernel/bvh/qbvh_nodes.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_nodes.h" +#endif #endif /* Regular BVH traversal */ diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 6356c197dd1..8def71bc890 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_local.h" +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_local.h" +# endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, int max_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + local_isect, + local_object, + lcg_state, + max_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index cfc567ff9ca..d525b29fd94 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_shadow_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_shadow_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, uint *num_hits) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + visibility, + max_hits, + num_hits); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 551625eae78..e95d2408201 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -20,6 +20,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_traversal.h" #endif +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_traversal.h" +#endif #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect @@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, ) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility +# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + , lcg_state, + difl, + extmax +# endif + ); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index ead424aaaaf..4ca0dc2225e 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ #define BVH_STACK_SIZE 192 #define BVH_QSTACK_SIZE 384 - +#define BVH_OSTACK_SIZE 768 /* BVH intersection function variations */ #define BVH_INSTANCING 1 diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index ce5fc7be33d..7d03855cb8f 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { 
switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 2ee29ac9c27..3d9b598914f 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -19,6 +19,9 @@ #ifdef __QBVH__ # include "kernel/bvh/qbvh_volume_all.h" +#ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_volume_all.h" +#endif #endif #if BVH_FEATURE(BVH_HAIR) @@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint visibility) { switch(kernel_data.bvh.bvh_layout) { +#ifdef __KERNEL_AVX2__ + case BVH_LAYOUT_BVH8: + return BVH_FUNCTION_FULL_NAME(OBVH)(kg, + ray, + isect_array, + max_hits, + visibility); +#endif #ifdef __QBVH__ case BVH_LAYOUT_BVH4: return BVH_FUNCTION_FULL_NAME(QBVH)(kg, diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h new file mode 100644 index 00000000000..50bcfa79b6c --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_local.h @@ -0,0 +1,409 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for subsurface scattering, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + LocalIntersection *local_isect, + int local_object, + uint *lcg_state, + int max_hits) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, local_object); + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = ray->t; + + local_isect->num_hits = 0; + + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; + isect_t = bvh_instance_motion_push(kg, + local_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); +#else + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); +#endif + object = local_object; + } + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); + + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + /* Intersect ray against primitive, */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) + { + return true; + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + /* Intersect ray against primitive. */ + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits)) + { + return true; + } + } + break; + } +#endif + default: + break; + } + } + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + return false; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h new file mode 100644 index 00000000000..93f35f6dffb --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_nodes.h @@ -0,0 +1,532 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Aligned nodes intersection AVX code is adopted from Embree, + */ + +struct OBVHStackItem { + int addr; + float dist; +}; + +ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, + int *ccl_restrict near_x, + int *ccl_restrict near_y, + int *ccl_restrict near_z, + int *ccl_restrict far_x, + int *ccl_restrict far_y, + int *ccl_restrict far_z) + +{ +#ifdef __KERNEL_SSE__ + *near_x = 0; *far_x = 1; + *near_y = 2; *far_y = 3; + *near_z = 4; *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; *far_x -= mask_x; + *near_y += mask_y; *far_y -= mask_y; + *near_z += mask_z; *far_z -= mask_z; +#else + if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } + if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } + if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } +#endif +} + +ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, + OBVHStackItem *ccl_restrict b) +{ + OBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5) +{ + obvh_stack_sort(s1, s2, s3, s4); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6) +{ + obvh_stack_sort(s1, s2, s3, s4, s5); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem 
*ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } +} + +ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, + OBVHStackItem *ccl_restrict s2, + OBVHStackItem *ccl_restrict s3, + OBVHStackItem *ccl_restrict s4, + OBVHStackItem *ccl_restrict s5, + OBVHStackItem *ccl_restrict s6, + OBVHStackItem *ccl_restrict s7, + OBVHStackItem *ccl_restrict s8) +{ + obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); + if(s8->dist < s7->dist) { + obvh_item_swap(s7, s8); + if(s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if(s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if(s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if(s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if(s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if(s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } + } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#else + const avx3f& org, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x); + const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z); + + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +ccl_device_inline int obvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#else + const avx3f& P, +#endif + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr + 2; +#ifdef __KERNEL_AVX2__ + const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); + const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); + const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); + const avxf tfar_y = 
msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); + const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); + const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = round_down*tnear <= round_up*tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; +#else + return 0; +#endif +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int obvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +} + +ccl_device_inline int obvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + 
const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); + + const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
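+ * The type is read from the node header: if PATH_RAY_NODE_UNALIGNED is
+ * set in the first float4, the node stores per-child affine transforms
+ * and the unaligned path is taken; otherwise the cheaper axis-aligned
+ * slab test (max of entry planes vs. min of exit planes) is used.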
+ */ + +ccl_device_inline int obvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& org_idir, +#endif + const avx3f& org, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return obvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int obvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const avxf& isect_near, + const avxf& isect_far, +#ifdef __KERNEL_AVX2__ + const avx3f& P_idir, +#endif + const avx3f& P, + const avx3f& dir, + const avx3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return obvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h new file mode 100644 index 00000000000..3e877065127 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h @@ -0,0 +1,687 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. 
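+ * The function body is emitted once per feature combination via
+ * BVH_FUNCTION_FULL_NAME, so disabled features cost nothing at runtime.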
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const int skip_object, + const uint max_hits, + uint *num_hits) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + + *num_hits = 0; + isect_array->t = tmax; + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(false +#ifdef __VISIBILITY_FLAG__ + || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) +#endif +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif + ) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) +//#if !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
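+ * (__bscf returns the index of the lowest set bit in child_mask and
+ * clears it, so each step below consumes exactly one hit child.)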
+ */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
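+ * (obvh_stack_sort is an insertion-sort network: each overload sorts
+ * its first N-1 items, then sifts the new item down by compare-and-swap,
+ * schematically:
+ *
+ *   obvh_stack_sort(s1, s2, s3);
+ *   if(s4->dist < s3->dist) { obvh_item_swap(s3, s4); ... }
+ *
+ * The smallest distance ends up in s1, i.e. on top of the stack, so the
+ * closest child is always popped first.)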
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. 
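+ * Leaves with fewer than three triangles use the scalar intersector;
+ * larger leaves go through triangle_intersect8, which appears to test
+ * the leaf's triangles against the ray in AVX batches of up to eight.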
*/ + if(p_type == PRIMITIVE_TRIANGLE) { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + int hit = triangle_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr); + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? */ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + } //while + } else { + kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type); + +#if BVH_FEATURE(BVH_INSTANCING) + int* nhiptr = &num_hits_in_instance; +#else + int nhi= 0; + int *nhiptr = &nhi; +#endif + + int result = triangle_intersect8(kg, + &isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + prim_count, + num_hits, + max_hits, + nhiptr, + isect_t); + if(result == 2) { + return true; + } + } // prim_count + } // PRIMITIVE_TRIANGLE + else { + while(prim_addr < prim_addr2) { + kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); + +#ifdef __SHADOW_TRICKS__ + uint tri_object = (object == OBJECT_NONE) + ? kernel_tex_fetch(__prim_object, prim_addr) + : object; + if(tri_object == skip_object) { + ++prim_addr; + continue; + } +#endif + + bool hit; + + /* todo: specialized intersect functions which don't fill in + * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? + * might give a few % performance improvement */ + + switch(p_type) { + +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); + break; + } +#endif +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + else { + hit = curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); + } + break; + } +#endif + default: { + hit = false; + break; + } + } + + /* Shadow ray early termination. */ + if(hit) { + /* detect if this surface has a shader with transparent shadows */ + + /* todo: optimize so primitive visibility flag indicates if + * the primitive has a transparent shadow shader? 
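+ * Until then the shader flag is fetched per hit: an opaque hit (no
+ * SD_HAS_TRANSPARENT_SHADOW) blocks all light immediately, otherwise
+ * the hit is recorded and the ray continues until max_hits is reached.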
*/ + int prim = kernel_tex_fetch(__prim_index, isect_array->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return true; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return true; + } + + /* move on to next entry in intersections array */ + isect_array++; + (*num_hits)++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + + isect_array->t = isect_t; + } + + prim_addr++; + }//while prim + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. 
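+ * The distances recorded inside the instance are in object space;
+ * t_fac is the scale factor returned by the pop, so multiplying each
+ * stored t converts the hits back to world-space distances.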
*/ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return false; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h new file mode 100644 index 00000000000..2021d8e1143 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_traversal.h @@ -0,0 +1,642 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
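+ * (bvh_clamp_direction nudges near-zero direction components away from
+ * zero so that the reciprocal taken by bvh_inverse_direction stays
+ * finite.)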
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + avxf tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; + + if(UNLIKELY(node_dist > isect->t) +#if BVH_FEATURE(BVH_MOTION) + || UNLIKELY(ray->time < inodes.y) + || UNLIKELY(ray->time > inodes.z) +#endif +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0 +#endif + ) + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + avxf dist; + + BVH_DEBUG_NEXT_NODE(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + avxf cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
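+ * (When difl != 0 the _robust node test above scales the slab bounds
+ * by 1 -/+ difl, widening acceptance so thin hair segments close to a
+ * box boundary are not culled early.)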
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. 
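+ * A negative prim_addr marks an instance leaf; otherwise [prim_addr,
+ * prim_addr2) indexes the primitive arrays and leaf.w carries the
+ * primitive type.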
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + int prim_count = prim_addr2 - prim_addr; + if(prim_count < 3) { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//for + } + else { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect8(kg, + &isect, + P, + dir, + visibility, + object, + prim_addr, + prim_count, + 0, + 0, + NULL, + 0.0f)) + { + tfar = avxf(isect->t); + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + }//prim count + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_INTERSECTION(); + const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); + kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + else { + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = avxf(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
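+ * Entering the instance transforms the ray into object space; all
+ * SIMD-packed ray constants (near/far indices, tfar, idir4, org4) are
+ * then rebuilt for the transformed ray.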
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. */ +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h new file mode 100644 index 00000000000..da9ddbd4f24 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_volume.h @@ -0,0 +1,483 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
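+ * Unlike the surface variants, these traversals only intersect
+ * primitives whose object has SD_OBJECT_HAS_VOLUME set.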
+ * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + avxf tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. 
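+ * (three to eight, with the eight-wide nodes used here.)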
We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
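+ * Leaving the instance restores the world-space ray, rescales isect->t
+ * back to world space, and resumes traversal at the node popped from
+ * the stack.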
*/ +# if BVH_FEATURE(BVH_MOTION) + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); +# else + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h new file mode 100644 index 00000000000..a88573e6f86 --- /dev/null +++ b/intern/cycles/kernel/bvh/obvh_volume_all.h @@ -0,0 +1,554 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT obvh_node_intersect +#else +# define NODE_INTERSECT obvh_aligned_node_intersect +#endif + +ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + const uint visibility) +{ + /* Traversal stack in CUDA thread-local memory. */ + OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + + /* Ray parameters in registers. 
*/ + const float tmax = ray->t; + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + float isect_t = tmax; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + + uint num_hits = 0; + isect_array->t = tmax; + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return 0; + } +#endif + +#if BVH_FEATURE(BVH_INSTANCING) + int num_hits_in_instance = 0; +#endif + + avxf tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +#endif + avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + + avxf dist; + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + avxf cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26); + } + else +#endif + { + cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. + */ + int c0 = __float_as_int(cnodes[r]); + float d0 = ((float*)&dist)[r]; + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + + /* Five children are hit, push all onto stack and sort 5 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c4 = __float_as_int(cnodes[r]); + float d4 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Six children are hit, push all onto stack and sort 6 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c5 = __float_as_int(cnodes[r]); + float d5 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c5; + traversal_stack[stack_ptr].dist = d5; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c4; + traversal_stack[stack_ptr].dist = d4; + + /* Seven children are hit, push all onto stack and sort 7 + * stack items, continue with closest child. 
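obvh_stack_sort keeps traversal near-to-far: after several children are pushed, the top few stack entries are reordered so the entry popped next is the closest. A scalar sketch of the three-item variant (types illustrative; the kernel's stack items likewise carry the distance used for ordering):

#include <cstdio>

struct StackItem { int addr; float dist; };

static void item_swap(StackItem *a, StackItem *b)
{
	StackItem t = *a; *a = *b; *b = t;
}

/* s1 is the top of the stack and must end up with the smallest
 * distance, since the traversal pops the top entry next. */
static void stack_sort3(StackItem *s1, StackItem *s2, StackItem *s3)
{
	if(s2->dist < s1->dist) item_swap(s2, s1);
	if(s3->dist < s2->dist) item_swap(s3, s2);
	if(s2->dist < s1->dist) item_swap(s2, s1);
}

int main(void)
{
	StackItem a = {10, 7.0f}, b = {11, 2.0f}, c = {12, 5.0f};
	stack_sort3(&a, &b, &c);
	printf("%d %d %d\n", a.addr, b.addr, c.addr); /* 11 12 10 */
	return 0;
}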
+ */ + r = __bscf(child_mask); + int c6 = __float_as_int(cnodes[r]); + float d6 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + /* Eight children are hit, push all onto stack and sort 8 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c7 = __float_as_int(cnodes[r]); + float d7 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c7; + traversal_stack[stack_ptr].dist = d7; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = c6; + traversal_stack[stack_ptr].dist = d6; + obvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3], + &traversal_stack[stack_ptr - 4], + &traversal_stack[stack_ptr - 5], + &traversal_stack[stack_ptr - 6], + &traversal_stack[stack_ptr - 7]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + + if((__float_as_uint(leaf.z) & visibility) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + const uint p_type = type & PRIMITIVE_ALL; + bool hit; + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + + /* Primitive intersection. */ + switch(p_type) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. 
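Leaf nodes are encoded as negative node addresses, which is why the leaf path above indexes __bvh_leaf_nodes with (-node_addr-1): inner nodes stay >= 0 and leaf index 0 remains representable. A minimal sketch of the two directions:

#include <cstdio>

static int encode_leaf(int leaf_index) { return -leaf_index - 1; }
static int decode_leaf(int node_addr)  { return -node_addr - 1; }

int main(void)
{
	for(int leaf = 0; leaf < 3; leaf++) {
		int addr = encode_leaf(leaf);
		printf("leaf %d -> addr %d -> leaf %d\n",
		       leaf, addr, decode_leaf(addr));
	}
	return 0;
}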
*/ + isect_array++; + num_hits++; +#if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +#endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +#if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +#endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + /* Only primitives from volume object. */ + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + /* Intersect ray against primitive. */ + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + if(hit) { + /* Move on to next entry in intersections array. */ + isect_array++; + num_hits++; +# if BVH_FEATURE(BVH_INSTANCING) + num_hits_in_instance++; +# endif + isect_array->t = isect_t; + if(num_hits == max_hits) { +# if BVH_FEATURE(BVH_INSTANCING) +# if BVH_FEATURE(BVH_MOTION) + float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); +# else + Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + float t_fac = 1.0f / len(transform_direction(&itfm, dir)); +# endif + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } +# endif /* BVH_FEATURE(BVH_INSTANCING) */ + return num_hits; + } + } + } + break; + } +#endif + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. */ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME) { +# if BVH_FEATURE(BVH_MOTION) + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); +# else + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); +# endif + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + num_hits_in_instance = 0; + isect_array->t = isect_t; + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_OSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + + node_addr = kernel_tex_fetch(__object_node, object); + } + else { + /* Pop. */ + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
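When max_hits is reached inside an instance, the distances recorded so far are still in object-space scale; they are mapped back to world space with the single factor t_fac, derived from how much the instance transform stretches the ray direction. A toy example with made-up numbers, assuming a uniform 2x instance scale:

#include <cmath>
#include <cstdio>

int main(void)
{
	/* The world-space direction gains length 2 in object space. */
	const float obj_dir[3] = {0.0f, 0.0f, 2.0f};
	const float len = std::sqrt(obj_dir[0]*obj_dir[0] +
	                            obj_dir[1]*obj_dir[1] +
	                            obj_dir[2]*obj_dir[2]);
	const float t_fac = 1.0f / len;

	const float t_object = 4.0f; /* a hit recorded inside the instance */
	printf("world-space t = %g\n", t_object * t_fac); /* 2 */
	return 0;
}

Because the factor is uniform along the ray, the whole num_hits_in_instance tail of the intersection array can be rescaled in one loop, as the kernel does.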
*/ + if(num_hits_in_instance) { + float t_fac; +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); +# else + bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); +# endif + /* Scale isect->t to adjust for instancing. */ + for(int i = 0; i < num_hits_in_instance; i++) { + (isect_array-i-1)->t *= t_fac; + } + } + else { +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); +# endif + } + + isect_t = tmax; + isect_array->t = isect_t; + + obvh_near_far_idx_calc(idir, + &near_x, &near_y, &near_z, + &far_x, &far_y, &far_z); + tfar = avxf(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); +# endif + idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); +# endif + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return num_hits; +} + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 3036efd4198..2e622af1758 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -85,7 +85,8 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, /* Axis-aligned nodes intersection */ -ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, +static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index a2b1e050e58..f8c671fed14 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -1,4 +1,4 @@ -/* + /* * Copyright 2014, Blender Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -70,7 +70,479 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, return false; } -/* Special ray intersection routines for local intersection. In that case we +#define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D)) +#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 +ccl_device_inline +#else +ccl_device_forceinline +#endif +int ray_triangle_intersect8(KernelGlobals *kg, + float3 ray_P, + float3 ray_dir, + Intersection **isect, + uint visibility, + int object, + __m256 *triA, + __m256 *triB, + __m256 *triC, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isec_t) +{ + + const unsigned char prim_num_mask = (1 << prim_num) - 1; + + const __m256i zero256 = _mm256_setzero_si256(); + + const __m256 Px256 = _mm256_set1_ps(ray_P.x); + const __m256 Py256 = _mm256_set1_ps(ray_P.y); + const __m256 Pz256 = _mm256_set1_ps(ray_P.z); + + const __m256 dirx256 = _mm256_set1_ps(ray_dir.x); + const __m256 diry256 = _mm256_set1_ps(ray_dir.y); + const __m256 dirz256 = _mm256_set1_ps(ray_dir.z); + + /* Calculate vertices relative to ray origin. 
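The cross256(A,B, C,D) macro defined above expands to a fused A*B - C*D, which is exactly one component of a cross product (e.g. X = Ay*Bz - Az*By) per call. A minimal check, requiring FMA support (e.g. -mavx2 -mfma):

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	const __m256 Ay = _mm256_set1_ps(1.0f), Bz = _mm256_set1_ps(5.0f);
	const __m256 Az = _mm256_set1_ps(2.0f), By = _mm256_set1_ps(3.0f);
	const __m256 X = _mm256_fmsub_ps(Ay, Bz, _mm256_mul_ps(Az, By));

	float out[8];
	_mm256_storeu_ps(out, X);
	printf("%g\n", out[0]); /* 1*5 - 2*3 = -1 */
	return 0;
}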
*/ + /* const float3 v0 = tri_c - P; + const float3 v1 = tri_a - P; + const float3 v2 = tri_b - P; */ + + __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256); + __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256); + __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256); + + __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256); + __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256); + __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256); + + __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256); + __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256); + __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256); + + __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256); + __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256); + __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256); + + __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256); + __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256); + __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256); + + __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256); + __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256); + __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256); + + /* Calculate triangle edges. + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2;*/ + + __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256); + __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256); + __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256); + + __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256); + __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256); + __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256); + + __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256); + __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256); + __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256); + + /* Perform edge tests. + const float U = dot(cross(v2 + v0, e0), ray_dir); + const float V = dot(cross(v0 + v1, e1), ray_dir); + const float W = dot(cross(v1 + v2, e2), ray_dir);*/ + + //cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) + __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256); + __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256); + __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256); + //vertical dot + __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256); + U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256)); + U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256)); + + __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256); + __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256); + __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256); + //vertical dot + __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256); + V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256)); + V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256)); + + __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256); + __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256); + __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256); + //vertical dot + __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256); + W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256)); + W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256)); + + //const float minUVW = min(U, min(V, W)); + //const float maxUVW = 
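The intrinsics above are a straight 8-lane widening of the scalar edge tests quoted in the comments. For reference, the whole test for a single lane, with illustrative helper names:

#include <cmath>
#include <cstdio>

struct F3 { float x, y, z; };

static F3 f3_sub(F3 a, F3 b) { F3 r = {a.x - b.x, a.y - b.y, a.z - b.z}; return r; }
static F3 f3_add(F3 a, F3 b) { F3 r = {a.x + b.x, a.y + b.y, a.z + b.z}; return r; }
static F3 f3_cross(F3 a, F3 b)
{
	F3 r = {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x};
	return r;
}
static float f3_dot(F3 a, F3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

static bool edge_tests_pass(F3 P, F3 dir, F3 tri_a, F3 tri_b, F3 tri_c)
{
	const F3 v0 = f3_sub(tri_c, P), v1 = f3_sub(tri_a, P), v2 = f3_sub(tri_b, P);
	const F3 e0 = f3_sub(v2, v0), e1 = f3_sub(v0, v1), e2 = f3_sub(v1, v2);
	const float U = f3_dot(f3_cross(f3_add(v2, v0), e0), dir);
	const float V = f3_dot(f3_cross(f3_add(v0, v1), e1), dir);
	const float W = f3_dot(f3_cross(f3_add(v1, v2), e2), dir);
	const float minUVW = std::fmin(U, std::fmin(V, W));
	const float maxUVW = std::fmax(U, std::fmax(V, W));
	/* Mixed signs mean the ray passes outside one of the edges. */
	return !(minUVW < 0.0f && maxUVW > 0.0f);
}

int main(void)
{
	F3 P = {0.0f, 0.0f, -1.0f}, dir = {0.0f, 0.0f, 1.0f};
	F3 a = {-1.0f, -1.0f, 0.0f}, b = {1.0f, -1.0f, 0.0f}, c = {0.0f, 1.0f, 0.0f};
	printf("inside: %d\n", edge_tests_pass(P, dir, a, b, c)); /* 1 */
	return 0;
}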
max(U, max(V, W)); +#if 0 + __m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256)); + __m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256)); + + //if(minUVW < 0.0f && maxUVW > 0.0f) + __m256i mask_minmaxUVW_256 = _mm256_and_si256( + _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)), + //_mm256_castps_si256(minUVW_256), + _mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256)); +#else + __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31); + __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31); + __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31); + __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1); + + const __m256i one256 = _mm256_set1_epi32(1); + const __m256i two256 = _mm256_set1_epi32(2); + + __m256i mask_minmaxUVW_256 = _mm256_or_si256( + _mm256_cmpeq_epi32(one256, UVW_256_1), + _mm256_cmpeq_epi32(two256, UVW_256_1) ); +#endif + + unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256)); + if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set + return false; + } + + /* Calculate geometry normal and denominator. */ + // const float3 Ng1 = cross(e1, e0); + //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); + + __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256); + __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256); + __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256); + + //const float3 Ng = Ng1 + Ng1; + Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256); + Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256); + Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256); + + //const float den = dot3(Ng, dir); + //vertical dot + __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256); + den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256)); + den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256)); + + // __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + + /* Perform depth test. 
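The #else path taken above avoids the min/max comparison entirely: it shifts out the sign bits of U, V and W and sums them, so a count of 1 or 2 means mixed signs and the lane is culled, while 0 (all non-negative) and 3 (all negative) pass. Signed zeros can classify slightly differently than the min/max form. A scalar model:

#include <cstdint>
#include <cstring>
#include <cstdio>

static int sign_bit(float f)
{
	uint32_t u;
	std::memcpy(&u, &f, sizeof(u));
	return int(u >> 31);
}

int main(void)
{
	float U = 0.5f, V = 0.25f, W = -0.1f;
	int n = sign_bit(U) + sign_bit(V) + sign_bit(W);
	printf("culled: %d\n", n == 1 || n == 2); /* 1 => mixed signs */
	return 0;
}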
*/ + //const float T = dot3(v0, Ng); + __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256); + T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256)); + T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256)); + + //const int sign_den = (__float_as_int(den) & 0x80000000); + const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000); + __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000); + + //const float sign_T = xor_signmask(T, sign_den); + __m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256)); + + /*if((sign_T < 0.0f) || mask_minmaxUVW_pos { return false;} */ + unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256); + if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) { + return false; + } /**/ + + __m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)); + + + ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8]; + ccl_align(32) unsigned int mask_minmaxUVW8[8]; + + if(visibility == PATH_RAY_SHADOW_OPAQUE){ + __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256 + + __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + + __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256)); + __m256 rayt_256 = _mm256_set1_ps((*isect)->t); + + __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256), + _mm256_castps_si256( + _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256) + ) + ); + /* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256, + _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256), + _CMP_GT_OS + ) );*/ + + mask0 = _mm256_or_si256(mask1, mask0); + //unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0)); + //unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256)); + //unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos)); + mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) + mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden) + + unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256)); + if((mask_final & prim_num_mask) == 0) { //all bits NOT set + return false; + } /**/ + + unsigned long i = 0; +#if defined(_MSC_VER) + unsigned char res = _BitScanForward(&i, (unsigned long)mask_final); +#else + i = __builtin_ffs(mask_final)-1; +#endif + + den_256 = _mm256_rcp_ps(den_256); //inv_den + U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den + V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den + T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den + + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); + + + //here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true + + (*isect)->u = U8[i]; + (*isect)->v = V8[i]; + (*isect)->t = T8[i]; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + return true; + } + else { + _mm256_store_ps(den8, den_256); + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); + + _mm256_store_ps(sign_T8, sign_T_256); + 
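The depth test here is division-free: with the sign of den transferred onto T, the condition 0 <= T/den <= t becomes 0 <= sign_T <= t*|den|, which is what the comparisons against rayt_256 implement. A scalar model of the trick:

#include <cstdint>
#include <cstring>
#include <cstdio>

static float xor_signmask(float f, uint32_t mask)
{
	uint32_t u;
	std::memcpy(&u, &f, sizeof(u));
	u ^= mask;
	std::memcpy(&f, &u, sizeof(f));
	return f;
}

int main(void)
{
	const float den = -2.0f, T = -3.0f, ray_t = 10.0f;

	uint32_t sign_den;
	std::memcpy(&sign_den, &den, sizeof(den));
	sign_den &= 0x80000000u;

	const float sign_T = xor_signmask(T, sign_den);    /* 3.0 */
	const float abs_den = xor_signmask(den, sign_den); /* 2.0 */
	const int hit = (sign_T >= 0.0f) && (sign_T <= ray_t * abs_den);
	printf("hit: %d (t = %g)\n", hit, T / den);        /* hit: 1, t = 1.5 */
	return 0;
}

The actual divisions (the inv_den multiplies) are deferred until a lane is known to hit, so misses never pay for them.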
_mm256_store_ps(xor_signmask8, xor_signmask_256); + _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256); + + int ret = false; + + if(visibility == PATH_RAY_SHADOW) { + for(int i = 0; i < prim_num; i++) { + if(!mask_minmaxUVW8[i]) { +#ifdef __VISIBILITY_FLAG__ + if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) +#endif + { + if((sign_T8[i] >= 0.0f) && + (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) + { + if(den8[i]) { + const float inv_den = 1.0f / den8[i]; + + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + int prim = kernel_tex_fetch(__prim_index, (*isect)->prim); + int shader = 0; + +#ifdef __HAIR__ + if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE) +#endif + { + shader = kernel_tex_fetch(__tri_shader, prim); + } +#ifdef __HAIR__ + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } +#endif + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; + + /* if no transparent shadows, all light is blocked */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return 2; + } + /* if maximum number of hits reached, block all light */ + else if(*num_hits == max_hits) { + return 2; + } + /* move on to next entry in intersections array */ + ret = true; + + (*isect)++; + (*num_hits)++; + + (*num_hits_in_instance)++; + + (*isect)->t = isec_t; + + } //den + } //if sign + } //vis + }//if mask + } //for + } + else { //default case + for(int i = 0; i < prim_num; i++) { + if(!mask_minmaxUVW8[i]) { +#ifdef __VISIBILITY_FLAG__ + if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) +#endif + { + if((sign_T8[i] >= 0.0f) && + (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) + { + if(den8[i]) { + const float inv_den = 1.0f / den8[i]; + + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + + ret = true; + } //den + } //if sign + } //vis + }//if mask + } //for + } //default + return ret; +}// else PATH_RAY_SHADOW_OPAQUE + +} + +//vz static +ccl_device_inline +int triangle_intersect8(KernelGlobals *kg, + Intersection **isect, + float3 P, + float3 dir, + uint visibility, + int object, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isec_t) + { + __m128 tri_a[8], tri_b[8], tri_c[8]; + __m256 tritmp[12], tri[12]; + __m256 triA[3], triB[3], triC[3]; + + int i, r; + + uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); + for(i = 0; i < prim_num; i++) { + tri_a[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + tri_b[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + tri_c[i] = *(__m128*)&kg->__prim_tri_verts.data[tri_vindex++]; + } + //create 9 or 12 placeholders + tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256 + tri[1] = _mm256_castps128_ps256(tri_b[0]);//_mm256_zextps128_ps256 + tri[2] = _mm256_castps128_ps256(tri_c[0]);//_mm256_zextps128_ps256 + + tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256 + tri[4] = _mm256_castps128_ps256(tri_b[1]);//_mm256_zextps128_ps256 + tri[5] = _mm256_castps128_ps256(tri_c[1]);//_mm256_zextps128_ps256 + + tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256 + tri[7] = 
_mm256_castps128_ps256(tri_b[2]);//_mm256_zextps128_ps256 + tri[8] = _mm256_castps128_ps256(tri_c[2]);//_mm256_zextps128_ps256 + + if(prim_num > 3) { + tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256 + tri[10] = _mm256_castps128_ps256(tri_b[3]);//_mm256_zextps128_ps256 + tri[11] = _mm256_castps128_ps256(tri_c[3]);//_mm256_zextps128_ps256 + } + + for(i = 4, r = 0; i < prim_num; i ++, r += 3) { + tri[r] = _mm256_insertf128_ps(tri[r] , tri_a[i], 1); + tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1); + tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1); + } + + //------------------------------------------------ + //0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1 + //1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1 + //2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1 + + //3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1 + //4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1 + //5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1 + + //6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1 + //7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1 + //8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1 + + //9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1 + //10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1 + //11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1 + + //"transpose" + tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5 + tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1 + + tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7 + tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1 + + tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5 + tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1 + + tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7 + tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1 + + tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5 + tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1 + + tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7 + tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! 
Zc2 Zc3 1 1 Zc6 Zc7 1 1
+
+	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+	triA[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
+	triA[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
+	triA[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
+
+	triB[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb6 Xb7
+	triB[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb6 Yb7
+	triB[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb6 Zb7
+
+	triC[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
+	triC[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
+	triC[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
+
+	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
+
+	int result = ray_triangle_intersect8(kg, P,
+	                                     dir,
+	                                     isect,
+	                                     visibility, object,
+	                                     triA,
+	                                     triB,
+	                                     triC,
+	                                     prim_addr,
+	                                     prim_num,
+	                                     num_hits,
+	                                     max_hits,
+	                                     num_hits_in_instance,
+	                                     isec_t);
+	return result;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
 * only want to intersect with primitives in the same object, and if case of
 * multiple hits we pick a single random primitive as the intersection point.
 * Returns whether traversal should be stopped.
@@ -83,7 +555,7 @@ ccl_device_inline bool triangle_intersect_local(
         float3 P,
         float3 dir,
         int object,
-        int local_object,
+        int local_object, 
         int prim_addr,
         float tmax,
         uint *lcg_state,
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 2f955741797..aa7a16afa1d 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,15 +71,13 @@ CCL_NAMESPACE_BEGIN
 /* Texture types to be compatible with CUDA textures. These are really just
  * simple arrays and after inlining fetch hopefully revert to being a simple
  * pointer lookup. */
-
 template<typename T> struct texture {
 	ccl_always_inline const T& fetch(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
 		return data[index];
 	}
-
-#ifdef __KERNEL_AVX__
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
 	/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
 	 * compatibility with existing indicies and data structures.
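The unpacklo/hi_ps steps followed by the _pd unpacks above form a 4x8 transpose: twelve AoS registers of (x y z 1) vertices become the SoA triA/triB/triC registers holding eight X, eight Y and eight Z. A runnable miniature of the same two-step pattern for one coordinate, requiring AVX:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	/* Vertices as x,y,z,1 quads, two per register; values chosen so
	 * vertex k has x = 10*k. */
	__m256 v0v4 = _mm256_setr_ps( 0.f,  1.f,  2.f, 1.f, 40.f, 41.f, 42.f, 1.f);
	__m256 v1v5 = _mm256_setr_ps(10.f, 11.f, 12.f, 1.f, 50.f, 51.f, 52.f, 1.f);
	__m256 v2v6 = _mm256_setr_ps(20.f, 21.f, 22.f, 1.f, 60.f, 61.f, 62.f, 1.f);
	__m256 v3v7 = _mm256_setr_ps(30.f, 31.f, 32.f, 1.f, 70.f, 71.f, 72.f, 1.f);

	__m256 lo01 = _mm256_unpacklo_ps(v0v4, v1v5); /* x0 x1 y0 y1 | x4 x5 y4 y5 */
	__m256 lo23 = _mm256_unpacklo_ps(v2v6, v3v7); /* x2 x3 y2 y3 | x6 x7 y6 y7 */
	__m256 x = _mm256_castpd_ps(_mm256_unpacklo_pd(
	        _mm256_castps_pd(lo01), _mm256_castps_pd(lo23)));

	float out[8];
	_mm256_storeu_ps(out, x);
	for(int i = 0; i < 8; i++)
		printf("%g ", out[i]); /* 0 10 20 30 40 50 60 70 */
	printf("\n");
	return 0;
}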
*/ @@ -90,7 +88,6 @@ template<typename T> struct texture { ssef *ssef_node_data = &ssef_data[index]; return _mm256_loadu_ps((float *)ssef_node_data); } - #endif #ifdef __KERNEL_SSE2__ @@ -148,6 +145,10 @@ ccl_device_inline void print_sse3i(const char *label, sse3i& a) print_ssei(label, a.z); } +#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +typedef vector3<avxf> avx3f; +#endif + #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index d169915cff9..e93100a6442 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -1384,8 +1384,9 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_BVH2 = (1 << 0), BVH_LAYOUT_BVH4 = (1 << 1), + BVH_LAYOUT_BVH8 = (1 << 2), - BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4, + BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8, BVH_LAYOUT_ALL = (unsigned int)(-1), } KernelBVHLayout; diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 508f44e7c4d..291f9a9fcae 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -78,6 +78,7 @@ set(SRC_HEADERS util_sky_model.h util_sky_model_data.h util_avxf.h + util_avxb.h util_sseb.h util_ssef.h util_ssei.h @@ -98,7 +99,9 @@ set(SRC_HEADERS util_types_float3_impl.h util_types_float4.h util_types_float4_impl.h - util_types_int2.h + util_types_float8.h + util_types_float8_impl.h + util_types_int2.h util_types_int2_impl.h util_types_int3.h util_types_int3_impl.h diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h new file mode 100644 index 00000000000..865549be283 --- /dev/null +++ b/intern/cycles/util/util_avxb.h @@ -0,0 +1,192 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_AVXB_H__ +#define __UTIL_AVXB_H__ + +CCL_NAMESPACE_BEGIN + +struct avxf; + +/*! 4-wide SSE bool type. 
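kernel_tex_fetch_avxf keeps the existing float4 (ssef) indexing while pulling in two consecutive blocks at once, i.e. a 32-byte (256-bit) load addressed at 16-byte granularity, which is presumably what the bytes/blocks wording in the comment above intends. A sketch of the equivalent addressing:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	float data[12];
	for(int i = 0; i < 12; i++)
		data[i] = float(i);

	int index = 1;                                /* in float4 blocks */
	__m256 v = _mm256_loadu_ps(data + 4 * index); /* floats 4..11 */

	float out[8];
	_mm256_storeu_ps(out, v);
	for(int i = 0; i < 8; i++)
		printf("%g ", out[i]); /* 4 5 6 7 8 9 10 11 */
	printf("\n");
	return 0;
}

This is why the BVH8 node layout can keep the float4-based node offsets (node_addr+14, node_addr+26) used by the traversal code.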
*/ +struct avxb +{ + typedef avxb Mask; // mask type + typedef avxf Float; // float type + + enum { size = 8 }; // number of SIMD elements + union { __m256 m256; int32_t v[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxb ( ) {} + __forceinline avxb ( const avxb& other ) { m256 = other.m256; } + __forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; } + + __forceinline avxb( const __m256 input ) : m256(input) {} + __forceinline operator const __m256&( void ) const { return m256; } + __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); } + __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); } + + //__forceinline avxb ( bool a ) + // : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + //__forceinline avxb ( bool a, bool b) + // : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + //__forceinline avxb ( bool a, bool b, bool c, bool d) + // : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + //__forceinline avxb(int mask) { + // assert(mask >= 0 && mask < 16); + // m128 = _mm_lookupmask_ps[mask]; + //} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {} + __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; } + __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); } + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); } +__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); } +__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); } + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; } +__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; } +__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; } + +//////////////////////////////////////////////////////////////////////////////// +/// 
Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+
+__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
+#if defined(__KERNEL_SSE41__)
+	return _mm256_blendv_ps(f, t, m);
+#else
+	return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
+__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
+
+#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
+                                                         ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxb shuffle( const avxb& a ) {
+	return _mm256_castsi256_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
+}
+
+/*
+template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
+	return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
+	return _mm_movehl_ps(a, a);
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
+	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
+	return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
+	return _mm_movehl_ps(b, a);
+}
+
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
+template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
+#endif
+
+#if defined(__KERNEL_SSE41__)
+template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
+template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
+#endif
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
+                                                      bool(a[5])+bool(a[6])+bool(a[7]); }
+#endif
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
+__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+	printf("%s: %d %d %d %d %d %d %d %d\n",
+	       label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 2451213963a..81365f96c94 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -19,7 +19,8 @@
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_AVX__
+struct avxb;
+
 struct avxf
 {
 	typedef avxf Float;
@@ -53,6 +54,9 @@ struct avxf
 	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
 		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
 
+	__forceinline avxf(float3 a) :
+		m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
+
 	__forceinline avxf(int a3, int a2, int a1, int a0)
 	{
@@ -73,8 +77,24 @@ struct avxf
 		m256 = _mm256_insertf128_ps(foo, b, 1);
 	}
 
+	__forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
+	__forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
 };
 
+__forceinline avxf cross(const avxf& a, const avxf& b)
+{
+	avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
+	       0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
+	return r;
+}
+
+__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
+{
+	const avxf t = _mm256_mul_ps(a.m256, b.m256);
+	den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+	den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Unary Operators
 ////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
 __forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
 
+__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Movement/Shifting/Shuffling Functions
 ////////////////////////////////////////////////////////////////////////////////
@@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
 	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
 }
 
+//#if defined(__KERNEL_SSE41__)
+__forceinline avxf maxi(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_max_ps(a, b);
+	return ci;
+}
+
+__forceinline avxf mini(const avxf& a, const avxf& b) {
+	const avxf ci = _mm256_min_ps(a, b);
+	return ci;
+}
+//#endif
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Ternary Operators
 ////////////////////////////////////////////////////////////////////////////////
@@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b,
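Eight float lanes produce an 8-bit _mm256_movemask_ps result, so the all-true pattern the reductions above compare against is 0xff; 0xf, the 4-wide sseb constant, would only cover the low four lanes. A quick check:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	__m256 t = _mm256_cmp_ps(_mm256_set1_ps(1.0f),
	                         _mm256_set1_ps(0.0f), _CMP_GT_OS);
	int mask = _mm256_movemask_ps(t);
	printf("mask = 0x%x, all = %d\n", mask, mask == 0xff); /* 0xff, 1 */
	return 0;
}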
const avxf& c) { return c-(a*b); #endif } +__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) { + return _mm256_fmsub_ps(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators +//////////////////////////////////////////////////////////////////////////////// +#ifdef __KERNEL_AVX2__ +__forceinline const avxb operator <=(const avxf& a, const avxf& b) { + return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); +} +#endif + #endif #ifndef _mm256_set_m128 @@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) { CCL_NAMESPACE_END -#endif diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 9a66a372822..a761c9c46de 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -57,7 +57,19 @@ void DebugFlags::CPU::reset() #undef STRINGIFY #undef CHECK_CPU_FLAGS - bvh_layout = BVH_LAYOUT_DEFAULT; + if (getenv("CYCLES_BVH2") != NULL) { + bvh_layout = BVH_LAYOUT_BVH2; + } + else if (getenv("CYCLES_BVH4") != NULL) { + bvh_layout = BVH_LAYOUT_BVH4; + } + else if (getenv("CYCLES_BVH8") != NULL) { + bvh_layout = BVH_LAYOUT_BVH8; + } + else { + bvh_layout = BVH_LAYOUT_DEFAULT; + } + split_kernel = false; } diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index dfe755a8789..96c549b9be5 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -121,6 +121,7 @@ CCL_NAMESPACE_END #include "util/util_types_float2.h" #include "util/util_types_float3.h" #include "util/util_types_float4.h" +#include "util/util_types_float8.h" #include "util/util_types_vector3.h" @@ -140,6 +141,7 @@ CCL_NAMESPACE_END #include "util/util_types_float2_impl.h" #include "util/util_types_float3_impl.h" #include "util/util_types_float4_impl.h" +#include "util/util_types_float8_impl.h" #include "util/util_types_vector3_impl.h" @@ -148,7 +150,10 @@ CCL_NAMESPACE_END # include "util/util_sseb.h" # include "util/util_ssei.h" # include "util/util_ssef.h" +#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +# include "util/util_avxb.h" # include "util/util_avxf.h" #endif +#endif #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h new file mode 100644 index 00000000000..2e693d49634 --- /dev/null +++ b/intern/cycles/util/util_types_float8.h @@ -0,0 +1,70 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
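msub and nmadd map to single fused instructions on AVX2: msub(a,b,c) = a*b - c via _mm256_fmsub_ps, and nmadd(a,b,c) = c - a*b via _mm256_fnmadd_ps. Verifying both identities on one lane (requires -mfma):

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	__m256 a = _mm256_set1_ps(3.0f);
	__m256 b = _mm256_set1_ps(4.0f);
	__m256 c = _mm256_set1_ps(5.0f);

	float msub[8], nmadd[8];
	_mm256_storeu_ps(msub, _mm256_fmsub_ps(a, b, c));   /* 3*4 - 5 =  7 */
	_mm256_storeu_ps(nmadd, _mm256_fnmadd_ps(a, b, c)); /* 5 - 3*4 = -7 */
	printf("%g %g\n", msub[0], nmadd[0]);
	return 0;
}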
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __UTIL_TYPES_FLOAT8_H__ +#define __UTIL_TYPES_FLOAT8_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ + +struct ccl_try_align(16) float8 { +#ifdef __KERNEL_AVX2__ + union { + __m256 m256; + struct { float a, b, c, d, e, f, g, h; }; + }; + + __forceinline float8(); + __forceinline float8(const float8& a); + __forceinline explicit float8(const __m256& a); + + __forceinline operator const __m256&(void) const; + __forceinline operator __m256&(void); + + __forceinline float8& operator =(const float8& a); + +#else /* __KERNEL_AVX2__ */ + float a, b, c, d, e, f, g, h; +#endif /* __KERNEL_AVX2__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float8 make_float8(float f); +ccl_device_inline float8 make_float8(float a, float b, float c, float d, + float e, float f, float g, float h); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT8_H__ */ diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h new file mode 100644 index 00000000000..4fac03569e9 --- /dev/null +++ b/intern/cycles/util/util_types_float8_impl.h @@ -0,0 +1,113 @@ +/* +Copyright (c) 2017, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of Intel Corporation nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__ +#define __UTIL_TYPES_FLOAT8_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif
+
+#ifndef __KERNEL_GPU__
+#  include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_AVX2__
+__forceinline float8::float8()
+{
+}
+
+__forceinline float8::float8(const float8& f)
+	: m256(f.m256)
+{
+}
+
+__forceinline float8::float8(const __m256& f)
+	: m256(f)
+{
+}
+
+__forceinline float8::operator const __m256&(void) const
+{
+	return m256;
+}
+
+__forceinline float8::operator __m256&(void)
+{
+	return m256;
+}
+
+__forceinline float8& float8::operator =(const float8& f)
+{
+	m256 = f.m256;
+	return *this;
+}
+#endif /* __KERNEL_AVX2__ */
+
+__forceinline float float8::operator[](int i) const
+{
+	util_assert(i >= 0);
+	util_assert(i < 8);
+	return *(&a + i);
+}
+
+__forceinline float& float8::operator[](int i)
+{
+	util_assert(i >= 0);
+	util_assert(i < 8);
+	return *(&a + i);
+}
+
+ccl_device_inline float8 make_float8(float f)
+{
+#ifdef __KERNEL_AVX2__
+	float8 r(_mm256_set1_ps(f));
+#else
+	float8 r = {f, f, f, f, f, f, f, f};
+#endif
+	return r;
+}
+
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+                                     float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX2__
+	float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+	float8 r = {a, b, c, d, e, f, g, h};
+#endif
+	return r;
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
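One portability note worth a check: _mm256_set_ps takes its arguments high lane first, so keeping lane 0 equal to the first field of the scalar {a, ..., h} fallback needs either reversed arguments or _mm256_setr_ps as used above. Demonstration:

#include <immintrin.h>
#include <cstdio>

int main(void)
{
	/* set_ps fills from the high lane down... */
	__m256 v = _mm256_set_ps(8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f);
	/* ...so memory order matches setr_ps(1, 2, ..., 8). */
	float out[8];
	_mm256_storeu_ps(out, v);
	printf("lane0 = %g, lane7 = %g\n", out[0], out[7]); /* 1, 8 */
	return 0;
}

Keeping the AVX2 and scalar paths lane-for-lane identical matters because operator[] indexes the union fields directly, regardless of which path built the value.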