17 files changed, 1616 insertions, 33 deletions
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index b591d5973fe..e7141c9ec64 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -479,8 +479,11 @@ void BVH::pack_instances(size_t nodes_size)
 
 				pack_nodes[pack_nodes_offset + nsize_bbox] = data;
 
-				if(use_qbvh)
-					pack_nodes[pack_nodes_offset + nsize_bbox+1] = bvh_nodes[i + nsize_bbox+1];
+				if(use_qbvh) {
+					memcpy(&pack_nodes[pack_nodes_offset + nsize_bbox+1],
+					       &bvh_nodes[i + nsize_bbox+1],
+					       sizeof(int4) * (nsize - (nsize_bbox+1)));
+				}
 
 				pack_nodes_offset += nsize;
 			}
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index ca1065f114a..f8d2ee60a3a 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -123,6 +123,11 @@ set(SRC_GEOM_HEADERS
 	geom/geom_motion_triangle.h
 	geom/geom_object.h
 	geom/geom_primitive.h
+	geom/geom_qbvh.h
+	geom/geom_qbvh_shadow.h
+	geom/geom_qbvh_subsurface.h
+	geom/geom_qbvh_traversal.h
+	geom/geom_qbvh_volume.h
 	geom/geom_triangle.h
 	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 3a768f37dd9..38fd7858a99 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -21,6 +21,7 @@
 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
 #define BVH_STACK_SIZE 192
 #define BVH_NODE_SIZE 4
+#define BVH_QNODE_SIZE 7
 #define TRI_NODE_SIZE 3
 
 /* silly workaround for float extended precision that happens when compiling
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index a9892679e24..c0eefcd9c7f 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -48,6 +48,11 @@ CCL_NAMESPACE_BEGIN
 
 #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
 
+/* Common QBVH functions. */
+#ifdef __QBVH__
+#include "geom_qbvh.h"
+#endif
+
 /* Regular BVH traversal */
 
 #define BVH_FUNCTION_NAME bvh_intersect
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 4bdfc7478aa..d6056026f24 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_shadow.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -380,11 +384,23 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint max_hits,
                                          uint *num_hits)
 {
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-	                                   ray,
-	                                   isect_array,
-	                                   max_hits,
-	                                   num_hits);
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits,
+		                                    num_hits);
+	}
+	else
+#endif
+	{
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits,
+		                                   num_hits);
+	}
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index 90cbbc08153..ff462142f6f 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_subsurface.h"
+#endif
+
 /* This is a template BVH traversal function for subsurface scattering, where
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
@@ -300,12 +304,25 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          uint *lcg_state,
                                          int max_hits)
 {
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-	                                   ray,
-	                                   isect_array,
-	                                   subsurface_object,
-	                                   lcg_state,
-	                                   max_hits);
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    subsurface_object,
+		                                    lcg_state,
+		                                    max_hits);
+	}
+	else
+#endif
+	{
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   subsurface_object,
+		                                   lcg_state,
+		                                   max_hits);
+	}
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index d48eda5f554..6e5b6ea476e 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_traversal.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -381,16 +385,33 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 #endif
                                          )
 {
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-	                                   ray,
-	                                   isect,
-	                                   visibility
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect,
+		                                    visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                    , lcg_state,
+		                                    difl,
+		                                    extmax
+#endif
+		                                    );
+	}
+	else
+#endif
+	{
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect,
+		                                   visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-	                                   , lcg_state,
-	                                   difl,
-	                                   extmax
+		                                   , lcg_state,
+		                                   difl,
+		                                   extmax
 #endif
-	                                   );
+		                                   );
+	}
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
index bae90c2a24a..8a25b5dc884 100644
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_volume.h"
+#endif
+
 /* This is a template BVH traversal function for volumes, where
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
@@ -314,9 +318,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect)
 {
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-	                                   ray,
-	                                   isect);
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect);
+	}
+	else
+#endif
+	{
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect);
+	}
 }
 
 #undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
new file mode 100644
index 00000000000..a1dd89c41ca
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
+                                       int *__restrict s2,
+                                       int *__restrict s3,
+                                       float *__restrict d1,
+                                       float *__restrict d2,
+                                       float *__restrict d3)
+{  /* Orders three (s, d) pairs by ascending d; assumes util_swap exchanges the pointed-to values. */
+	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }  /* Order pair (1, 2). */
+	if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }  /* Largest distance ends up in slot 3. */
+	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }  /* Re-order pair (1, 2): d1 <= d2 <= d3. */
+}
+
+ccl_device_inline void qbvh_stack_sort(int *__restrict s1,
+                                       int *__restrict s2,
+                                       int *__restrict s3,
+                                       int *__restrict s4,
+                                       float *__restrict d1,
+                                       float *__restrict d2,
+                                       float *__restrict d3,
+                                       float *__restrict d4)
+{  /* Orders four (s, d) pairs by ascending d; assumes util_swap exchanges the pointed-to values. */
+	if(*d2 < *d1) { util_swap(s2, s1); util_swap(d2, d1); }  /* Order pair (1, 2). */
+	if(*d4 < *d3) { util_swap(s4, s3); util_swap(d4, d3); }  /* Order pair (3, 4). */
+	if(*d3 < *d1) { util_swap(s3, s1); util_swap(d3, d1); }  /* Smallest distance ends up in slot 1. */
+	if(*d4 < *d2) { util_swap(s4, s2); util_swap(d4, d2); }  /* Largest distance ends up in slot 4. */
+	if(*d3 < *d2) { util_swap(s3, s2); util_swap(d3, d2); }  /* Order the two middle slots. */
+}
+
+ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
+                                          const ssef& tnear,
+                                          const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                          const sse3f& org_idir,
+#else
+                                          const sse3f& org,
+#endif
+                                          const sse3f& idir,
+                                          const int near_x,
+                                          const int near_y,
+                                          const int near_z,
+                                          const int far_x,
+                                          const int far_y,
+                                          const int far_z,
+                                          const int nodeAddr,
+                                          ssef *__restrict dist)
+{  /* Ray/bounds overlap test on the rows of one node, four lanes at once; returns a 4-bit lane mask. */
+	const int offset = nodeAddr*BVH_QNODE_SIZE;  /* First row of this node in __bvh_nodes. */
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
+#endif
+
+#ifdef __KERNEL_SSE41__
+	const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear));
+	const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar));
+	const sseb vmask = cast(tNear) > cast(tFar);
+	int mask = (int)movemask(vmask)^0xf;  /* vmask marks tNear > tFar; flip the low 4 bits so set bits mean overlap. */
+#else
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = tNear <= tFar;
+	int mask = (int)movemask(vmask);
+#endif
+	*dist = tNear;  /* Per-lane entry distances handed back to the caller. */
+	return mask;
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
+                                                 const ssef& tnear,
+                                                 const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                                 const sse3f& P_idir,
+#else
+                                                 const sse3f& P,
+#endif
+                                                 const sse3f& idir,
+                                                 const int near_x,
+                                                 const int near_y,
+                                                 const int near_z,
+                                                 const int far_x,
+                                                 const int far_y,
+                                                 const int far_z,
+                                                 const int nodeAddr,
+                                                 const float difl,
+                                                 ssef *__restrict dist)
+{  /* Same overlap test as qbvh_node_intersect, but with both sides of the comparison scaled using difl. */
+	const int offset = nodeAddr*BVH_QNODE_SIZE;  /* First row of this node in __bvh_nodes. */
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
+#endif
+
+	const float round_down = 1.0f - difl;
+	const float round_up = 1.0f + difl;
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = round_down*tNear <= round_up*tFar;  /* tNear scaled by (1 - difl) against tFar scaled by (1 + difl). */
+	*dist = tNear;  /* Unscaled per-lane entry distances handed back to the caller. */
+	return (int)movemask(vmask);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
new file mode 100644
index 00000000000..f8279996450
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -0,0 +1,378 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Template QBVH traversal for shadow rays. The features below are compiled in
+ * or out through BVH_FEATURE(), so unused ones do not slow other variants.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ * Returns true once a hit lacks SD_HAS_TRANSPARENT_SHADOW or max_hits is reached;
+ * otherwise false, with hits advanced through isect_array and counted in *num_hits. */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             uint *num_hits)
+{
+	/* TODO(sergey):
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends traversal when popped. */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the stack top and the node being visited. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters; P, dir, idir go by pointer to the instance push/pop helpers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+	ssef tnear(0.0f), tfar(tmax);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Per-axis node row offsets, picked by the sign of idir, for near and far bounds. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (address >= 0; a negative address is a leaf). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							traversalStack[stackPtr] = c0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							traversalStack[stackPtr] = c1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					traversalStack[stackPtr] = c1;
+					++stackPtr;
+					traversalStack[stackPtr] = c0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						traversalStack[stackPtr] = c2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2],
+						                &d2, &d1, &d0);
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					traversalStack[stackPtr] = c3;
+					++stackPtr;
+					traversalStack[stackPtr] = c2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3],
+					                &d3, &d2, &d1, &d0);
+				}
+
+				nodeAddr = traversalStack[stackPtr];
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* Pop the next node up front; this leaf's primitives are tested below. */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+#ifdef __VISIBILITY_FLAG__
+					if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+						continue;
+					}
+#endif
+
+					/* Primitive intersection. */
+					while(primAddr < primAddr2) {
+						bool hit;
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* Shadow ray early termination. */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								shader =  kernel_tex_fetch(__tri_shader, prim);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;
+						}
+
+						primAddr++;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push: re-derive the ray state for the object and enter its node. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect_t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+
+				/* scale t of the hits recorded inside this instance by t_fac */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(tmax);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
new file mode 100644
index 00000000000..bc43d81f9d3
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -0,0 +1,300 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             int subsurface_object,
+                                             uint *lcg_state,
+                                             int max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - SSE for hair.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends the traversal. */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the stack top and the node to visit next. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray state, handed by pointer to the instance push/pop helpers; num_hits is the return value. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
+	uint num_hits = 0;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop: runs until the sentinel stored in slot 0 is popped. */
+	do {
+		do {
+			/* Traverse inner nodes; negative addresses are leaves, handled below. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							traversalStack[stackPtr] = c0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							traversalStack[stackPtr] = c1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					traversalStack[stackPtr] = c1;
+					++stackPtr;
+					traversalStack[stackPtr] = c0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						traversalStack[stackPtr] = c2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2],
+						                &d2, &d1, &d0);
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					traversalStack[stackPtr] = c3;
+					++stackPtr;
+					traversalStack[stackPtr] = c2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3],
+					                &d3, &d2, &d1, &d0);
+				}
+
+				nodeAddr = traversalStack[stackPtr];
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* Pop the next node up front; the primitive loop does not modify nodeAddr. */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* Primitive intersection. */
+					for(; primAddr < primAddr2; primAddr++) {
+						/* Only test primitives that belong to subsurface_object. */
+						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+
+						if(tri_object != subsurface_object)
+							continue;
+
+						/* Intersect ray against primitive. */
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								break;
+							}
+#endif
+							default: {
+								break;
+							}
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only when the instance is subsurface_object itself. */
+					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
+						object = subsurface_object;
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Other object: skip the instance and pop the next node. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop: a sentinel was reached while entries remain on the stack. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
new file mode 100644
index 00000000000..56289900e80
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -0,0 +1,361 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                             ,uint *lcg_state,
+                                             float difl,
+                                             float extmax
+#endif
+                                             )
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends the traversal. */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the stack top and the node to visit next. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray state, handed by pointer to the instance push/pop helpers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop: runs until the sentinel stored in slot 0 is popped. */
+	do {
+		do {
+			/* Traverse inner nodes; negative addresses are leaves, handled below. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int traverseChild;
+				ssef dist;
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					/* NOTE: We extend all the child BB instead of fetching
+					 * and checking visibility flags for each of them.
+					 *
+					 * Need to test if doing the opposite would be any faster.
+					 */
+					traverseChild = qbvh_node_intersect_robust(kg,
+					                                           tnear,
+					                                           tfar,
+#ifdef __KERNEL_AVX2__
+					                                           P_idir4,
+#else
+					                                           org,
+#endif
+					                                           idir4,
+					                                           near_x, near_y, near_z,
+					                                           far_x, far_y, far_z,
+					                                           nodeAddr,
+					                                           difl,
+					                                           &dist);
+				}
+				else
+#endif
+				{
+					traverseChild = qbvh_node_intersect(kg,
+					                                    tnear,
+					                                    tfar,
+#ifdef __KERNEL_AVX2__
+					                                    P_idir4,
+#else
+					                                    org,
+#endif
+					                                    idir4,
+					                                    near_x, near_y, near_z,
+					                                    far_x, far_y, far_z,
+					                                    nodeAddr,
+					                                    &dist);
+				}
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							traversalStack[stackPtr] = c0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							traversalStack[stackPtr] = c1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					traversalStack[stackPtr] = c1;
+					++stackPtr;
+					traversalStack[stackPtr] = c0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						traversalStack[stackPtr] = c2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2],
+						                &d2, &d1, &d0);
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					traversalStack[stackPtr] = c3;
+					++stackPtr;
+					traversalStack[stackPtr] = c2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3],
+					                &d3, &d2, &d1, &d0);
+				}
+
+				nodeAddr = traversalStack[stackPtr];
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* Pop the next node up front; the primitive loop does not modify nodeAddr. */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+#ifdef __VISIBILITY_FLAG__
+					if((__float_as_uint(leaf.z) & visibility) == 0) {
+						continue;
+					}
+#endif
+
+					/* Primitive intersection. */
+					while(primAddr < primAddr2) {
+						bool hit;
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+#if defined(__KERNEL_DEBUG__)
+						isect->num_traversal_steps++;
+#endif
+
+						/* On a hit, shrink the far bound to isect->t; opaque shadow rays return at the first hit. */
+						if(hit) {
+							tfar = ssef(isect->t);
+							if(visibility == PATH_RAY_SHADOW_OPAQUE)
+								return true;
+						}
+
+						primAddr++;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push: a negative primAddr encodes the instanced object as -primAddr-1. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect->t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop: a sentinel was reached while entries remain on the stack. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
new file mode 100644
index 00000000000..3630436bddc
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@@ -0,0 +1,320 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends the traversal. */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the stack top and the node to visit next. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray state, handed by pointer to the instance push/pop helpers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop: runs until the sentinel stored in slot 0 is popped. */
+	do {
+		do {
+			/* Traverse inner nodes. NOTE(review): the debug step counter below is never reset in this function -- confirm the caller initializes it. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
+
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							traversalStack[stackPtr] = c0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							traversalStack[stackPtr] = c1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					traversalStack[stackPtr] = c1;
+					++stackPtr;
+					traversalStack[stackPtr] = c0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						traversalStack[stackPtr] = c2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2],
+						                &d2, &d1, &d0);
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					traversalStack[stackPtr] = c3;
+					++stackPtr;
+					traversalStack[stackPtr] = c2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3],
+					                &d3, &d2, &d1, &d0);
+				}
+
+				nodeAddr = traversalStack[stackPtr];
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_QNODE_SIZE+6);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* Pop the next node up front; the primitive loop does not modify nodeAddr. */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* Primitive intersection. */
+					for(; primAddr < primAddr2; primAddr++) {
+						/* Only primitives of objects flagged SD_OBJECT_HAS_VOLUME are tested. */
+						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+						int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+
+						if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+							continue;
+						}
+
+						/* Intersect ray against primitive; the helpers' return values are not used here. */
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								triangle_intersect(kg, &isect_precalc, isect, P, dir, visibility, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								break;
+							}
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only for objects flagged SD_OBJECT_HAS_VOLUME. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect->t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* No volume flag: reset object, skip the instance and pop the next node. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop: a sentinel was reached while entries remain on the stack. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 2f0b78ea073..8140a3b7725 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -52,7 +52,6 @@ template<typename T> struct texture  {
 		return data[index];
 	}
 
-#if 0
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
@@ -64,7 +63,6 @@ template<typename T> struct texture  {
 		kernel_assert(index >= 0 && index < width);
 		return ((ssei*)data)[index];
 	}
-#endif
 
 	T *data;
 	int width;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 460ca7b68eb..1bc5cf1fc32 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -57,6 +57,9 @@ CCL_NAMESPACE_BEGIN
 
 /* device capabilities */
 #ifdef __KERNEL_CPU__
+#ifdef __KERNEL_SSE2__
+#  define __QBVH__
+#endif
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
@@ -947,8 +950,8 @@ typedef struct KernelBVH {
 	int have_motion;
 	int have_curves;
 	int have_instancing;
-
-	int pad1, pad2, pad3;
+	int use_qbvh;
+	int pad1, pad2;
 } KernelBVH;
 
 typedef enum CurveFlag {
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 6137f7d4fdc..f8671db18dd 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1027,6 +1027,7 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	}
 
 	dscene->data.bvh.root = pack.root_index;
+	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
 }
 
 void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
@@ -1094,7 +1095,12 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 
 	foreach(Mesh *mesh, scene->meshes) {
 		if(mesh->need_update) {
-			pool.push(function_bind(&Mesh::compute_bvh, mesh, &scene->params, &progress, i, num_bvh));
+			pool.push(function_bind(&Mesh::compute_bvh,
+			                        mesh,
+			                        &scene->params,
+			                        &progress,
+			                        i,
+			                        num_bvh));
 			i++;
 		}
 	}
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 5d205225d97..51324edd8ff 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -26,6 +26,7 @@
 
 #include "util_param.h"
 #include "util_string.h"
+#include "util_system.h"
 #include "util_thread.h"
 #include "util_types.h"
 #include "util_vector.h"
@@ -135,11 +136,7 @@ public:
 		bvh_type = BVH_DYNAMIC;
 		use_bvh_cache = false;
 		use_bvh_spatial_split = false;
-#ifdef __QBVH__
-		use_qbvh = true;
-#else
 		use_qbvh = false;
-#endif
 		persistent_data = false;
 	}