From 4355603790712032e89fa4da6d8ce7f3ede62b4f Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon, 11 Jul 2016 12:28:45 +0200
Subject: Cycles: Move BVH kernel files to own directory

BVH traversal is not really geometry code, and we have quite a few
traversal variants by now. It makes sense to keep them separate for
the sake of source structure clarity.
---
 intern/cycles/kernel/CMakeLists.txt              |  40 +-
 intern/cycles/kernel/bvh/bvh.h                   | 424 +++++++++++++++
 intern/cycles/kernel/bvh/bvh_nodes.h             | 656 +++++++++++++++++++++++
 intern/cycles/kernel/bvh/bvh_shadow.h            | 386 +++++++++++++
 intern/cycles/kernel/bvh/bvh_subsurface.h        | 266 +++++++++
 intern/cycles/kernel/bvh/bvh_traversal.h         | 428 +++++++++++++++
 intern/cycles/kernel/bvh/bvh_volume.h            | 324 +++++++++++
 intern/cycles/kernel/bvh/bvh_volume_all.h        | 397 ++++++++++++++
 intern/cycles/kernel/bvh/qbvh_nodes.h            | 433 +++++++++++++++
 intern/cycles/kernel/bvh/qbvh_shadow.h           | 449 ++++++++++++++++
 intern/cycles/kernel/bvh/qbvh_subsurface.h       | 299 +++++++++++
 intern/cycles/kernel/bvh/qbvh_traversal.h        | 465 ++++++++++++++++
 intern/cycles/kernel/bvh/qbvh_volume.h           | 374 +++++++++++++
 intern/cycles/kernel/bvh/qbvh_volume_all.h       | 446 +++++++++++++++
 intern/cycles/kernel/geom/geom.h                 |   9 -
 intern/cycles/kernel/geom/geom_bvh.h             | 417 --------------
 intern/cycles/kernel/geom/geom_bvh_nodes.h       | 656 -----------------------
 intern/cycles/kernel/geom/geom_bvh_shadow.h      | 386 -------------
 intern/cycles/kernel/geom/geom_bvh_subsurface.h  | 266 ---------
 intern/cycles/kernel/geom/geom_bvh_traversal.h   | 428 ---------------
 intern/cycles/kernel/geom/geom_bvh_volume.h      | 324 -----------
 intern/cycles/kernel/geom/geom_bvh_volume_all.h  | 397 --------------
 intern/cycles/kernel/geom/geom_qbvh.h            | 433 ---------------
 intern/cycles/kernel/geom/geom_qbvh_shadow.h     | 449 ----------------
 intern/cycles/kernel/geom/geom_qbvh_subsurface.h | 299 -----------
 intern/cycles/kernel/geom/geom_qbvh_traversal.h  | 465 ----------------
 intern/cycles/kernel/geom/geom_qbvh_volume.h     | 374 -------------
 intern/cycles/kernel/geom/geom_qbvh_volume_all.h | 446 ---------------
 intern/cycles/kernel/kernel_path.h               |   1 +
 intern/cycles/kernel/kernels/opencl/kernel.cl    |   1 +
 intern/cycles/kernel/osl/osl_services.cpp        |   1 +
 intern/cycles/kernel/split/kernel_split_common.h |   1 +
 32 files changed, 5377 insertions(+), 5363 deletions(-)
 create mode 100644 intern/cycles/kernel/bvh/bvh.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_nodes.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_shadow.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_subsurface.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_traversal.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_volume.h
 create mode 100644 intern/cycles/kernel/bvh/bvh_volume_all.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_nodes.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_shadow.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_subsurface.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_traversal.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_volume.h
 create mode 100644 intern/cycles/kernel/bvh/qbvh_volume_all.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_nodes.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_shadow.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_subsurface.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_traversal.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_volume.h
 delete mode 100644 intern/cycles/kernel/geom/geom_bvh_volume_all.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh_shadow.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh_subsurface.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh_traversal.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh_volume.h
 delete mode 100644 intern/cycles/kernel/geom/geom_qbvh_volume_all.h

(limited to 'intern')

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3c2f7747f34..3f0917bb992 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -28,6 +28,22 @@ set(SRC
 	kernels/cuda/kernel.cu
 )
 
+set(SRC_BVH_HEADERS  # BVH traversal kernel headers; also referenced by cuda_sources, add_library(cycles_kernel) and delayed_install() in this file
+	bvh/bvh.h
+	bvh/bvh_nodes.h
+	bvh/bvh_shadow.h
+	bvh/bvh_subsurface.h
+	bvh/bvh_traversal.h
+	bvh/bvh_volume.h
+	bvh/bvh_volume_all.h
+	bvh/qbvh_nodes.h
+	bvh/qbvh_shadow.h
+	bvh/qbvh_subsurface.h
+	bvh/qbvh_traversal.h
+	bvh/qbvh_volume.h
+	bvh/qbvh_volume_all.h
+)
+
 set(SRC_HEADERS
 	kernel_accumulate.h
 	kernel_bake.h
@@ -140,24 +156,11 @@ set(SRC_SVM_HEADERS
 set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
-	geom/geom_bvh.h
-	geom/geom_bvh_nodes.h
-	geom/geom_bvh_shadow.h
-	geom/geom_bvh_subsurface.h
-	geom/geom_bvh_traversal.h
-	geom/geom_bvh_volume.h
-	geom/geom_bvh_volume_all.h
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_object.h
 	geom/geom_primitive.h
-	geom/geom_qbvh.h
-	geom/geom_qbvh_shadow.h
-	geom/geom_qbvh_subsurface.h
-	geom/geom_qbvh_traversal.h
-	geom/geom_qbvh_volume.h
-	geom/geom_qbvh_volume_all.h
 	geom/geom_triangle.h
 	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
@@ -213,7 +216,14 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernels/cuda/kernel.cu
+		${SRC_HEADERS}
+		${SRC_BVH_HEADERS}
+		${SRC_SVM_HEADERS}
+		${SRC_GEOM_HEADERS}
+		${SRC_CLOSURE_HEADERS}
+		${SRC_UTIL_HEADERS}
+	)
 	set(cuda_cubins)
 
 	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
@@ -313,6 +323,7 @@ add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
 	${SRC_GEOM_HEADERS}
@@ -347,6 +358,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
new file mode 100644
index 00000000000..b1802596c5a
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -0,0 +1,424 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* BVH
+ *
+ * Bounding volume hierarchy for ray tracing. We compile different variations
+ * of the same BVH traversal function for faster rendering when some types of
+ * primitives are not needed, using #includes to work around the lack of
+ * C++ templates in OpenCL.
+ *
+ * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
+ * the code has been extended and modified to support more primitives and work
+ * with CPU/CUDA/OpenCL. */
+
+CCL_NAMESPACE_BEGIN
+
+/* Don't inline intersect functions on GPU, this is faster */
+#ifdef __KERNEL_GPU__
+#  define ccl_device_intersect ccl_device_noinline
+#else
+#  define ccl_device_intersect ccl_device_inline
+#endif
+
+/* bottom-most stack entry, indicating the end of traversal */
+#define ENTRYPOINT_SENTINEL 0x76543210
+
+/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
+#define BVH_STACK_SIZE 192
+#define BVH_QSTACK_SIZE 384
+
+/* BVH intersection function variations */
+
+#define BVH_INSTANCING			1
+#define BVH_MOTION				2
+#define BVH_HAIR				4
+#define BVH_HAIR_MINIMUM_WIDTH	8
+
+#define BVH_NAME_JOIN(x,y) x ## _ ## y
+#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y)
+#define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME)
+
+#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+/* Debugging helpers */
+#ifdef __KERNEL_DEBUG__
+#  define BVH_DEBUG_INIT() \
+	do { \
+		isect->num_traversal_steps = 0; \
+		isect->num_traversed_instances = 0; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_STEP() \
+	do { \
+		++isect->num_traversal_steps; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_INSTANCE() \
+	do { \
+		++isect->num_traversed_instances; \
+	} while(0)
+#else  /* __KERNEL_DEBUG__ */
+#  define BVH_DEBUG_INIT()
+#  define BVH_DEBUG_NEXT_STEP()
+#  define BVH_DEBUG_NEXT_INSTANCE()
+#endif  /* __KERNEL_DEBUG__ */
+
+
+/* Common QBVH functions. */
+#ifdef __QBVH__
+#  include "qbvh_nodes.h"
+#endif
+
+/* Regular BVH traversal */
+
+#include "bvh_nodes.h"
+
+#define BVH_FUNCTION_NAME bvh_intersect
+#define BVH_FUNCTION_FEATURES 0
+#include "bvh_traversal.h"
+
+#if defined(__INSTANCING__)
+#  define BVH_FUNCTION_NAME bvh_intersect_instancing
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#  include "bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__)
+#  define BVH_FUNCTION_NAME bvh_intersect_hair
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#  include "bvh_traversal.h"
+#endif
+
+#if defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#  include "bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#  include "bvh_traversal.h"
+#endif
+
+/* Subsurface scattering BVH traversal */
+
+#if defined(__SUBSURFACE__)
+#  define BVH_FUNCTION_NAME bvh_intersect_subsurface
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
+#  include "bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+#  define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
+#  include "bvh_subsurface.h"
+#endif
+
+/* Volume BVH traversal */
+
+#if defined(__VOLUME__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
+#  include "bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#  include "bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+#  include "bvh_volume.h"
+#endif
+
+/* Record all intersections - Shadow BVH traversal */
+
+#if defined(__SHADOW_RECORD_ALL__)
+#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+#  define BVH_FUNCTION_FEATURES 0
+#  include "bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
+#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#  include "bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
+#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#  include "bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#  include "bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#  include "bvh_shadow.h"
+#endif
+
+/* Record all intersections - Volume BVH traversal  */
+
+#if defined(__VOLUME_RECORD_ALL__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume_all
+#  define BVH_FUNCTION_FEATURES BVH_HAIR
+#  include "bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#  include "bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+#  include "bvh_volume_all.h"
+#endif
+
+#undef BVH_FEATURE
+#undef BVH_NAME_JOIN
+#undef BVH_NAME_EVAL
+#undef BVH_FUNCTION_FULL_NAME
+/* Scene intersection entry point: forwards to the bvh_intersect_* variant chosen by compile-time features and the kernel_data.bvh flags. */
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+                                          const Ray *ray,
+                                          const uint visibility,
+                                          Intersection *isect,
+                                          uint *lcg_state,
+                                          float difl,
+                                          float extmax)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {  /* motion variants are checked first and always return */
+#  ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#  endif /* __HAIR__ */
+
+		return bvh_intersect_motion(kg, ray, isect, visibility);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)  /* lcg_state, difl and extmax are only forwarded to the hair variants */
+		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#  ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)  /* CPU only: pick the instancing variant at runtime */
+		return bvh_intersect_instancing(kg, ray, isect, visibility);
+#  endif /* __INSTANCING__ */
+
+	return bvh_intersect(kg, ray, isect, visibility);
+#else /* __KERNEL_CPU__ */
+
+#  ifdef __INSTANCING__
+	return bvh_intersect_instancing(kg, ray, isect, visibility);  /* non-CPU: no runtime have_instancing check */
+#  else
+	return bvh_intersect(kg, ray, isect, visibility);
+#  endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+/* Subsurface intersection entry point: forwards all arguments to the motion or non-motion bvh_intersect_subsurface* variant. */
+#ifdef __SUBSURFACE__
+ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     SubsurfaceIntersection *ss_isect,
+                                                     int subsurface_object,
+                                                     uint *lcg_state,
+                                                     int max_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {  /* runtime switch to the motion-aware traversal */
+		return bvh_intersect_subsurface_motion(kg,
+		                                       ray,
+		                                       ss_isect,
+		                                       subsurface_object,
+		                                       lcg_state,
+		                                       max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+	return bvh_intersect_subsurface(kg,
+	                                ray,
+	                                ss_isect,
+	                                subsurface_object,
+	                                lcg_state,
+	                                max_hits);
+}
+#endif  /* __SUBSURFACE__ */
+/* Shadow "record all" entry point: picks the bvh_intersect_shadow_all* variant from the motion/curves/instancing flags, checked in that order. */
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+{
+#  ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#    ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+#    endif /* __HAIR__ */
+
+		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+	}
+#  endif /* __OBJECT_MOTION__ */
+
+#  ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+#  endif /* __HAIR__ */
+
+#  ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)  /* runtime check here, with no __KERNEL_CPU__ split */
+		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+#  endif /* __INSTANCING__ */
+
+	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif  /* __SHADOW_RECORD_ALL__ */
+/* Volume intersection entry point: forwards to the motion, instancing or plain bvh_intersect_volume* variant. */
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+                                                 const Ray *ray,
+                                                 Intersection *isect,
+                                                 const uint visibility)
+{
+#  ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+		return bvh_intersect_volume_motion(kg, ray, isect, visibility);
+	}
+#  endif /* __OBJECT_MOTION__ */
+#  ifdef __KERNEL_CPU__
+#    ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)  /* CPU only: pick the instancing variant at runtime */
+		return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
+#    endif /* __INSTANCING__ */
+	return bvh_intersect_volume(kg, ray, isect, visibility);
+#  else /* __KERNEL_CPU__ */
+#    ifdef __INSTANCING__
+	return bvh_intersect_volume_instancing(kg, ray, isect, visibility);  /* non-CPU: no runtime have_instancing check */
+#    else
+	return bvh_intersect_volume(kg, ray, isect, visibility);
+#    endif /* __INSTANCING__ */
+#  endif /* __KERNEL_CPU__ */
+}
+#endif  /* __VOLUME__ */
+/* Volume "record all" entry point: returns the uint result of the selected bvh_intersect_volume_all* variant. */
+#ifdef __VOLUME_RECORD_ALL__
+ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint max_hits,
+                                                     const uint visibility)
+{
+#  ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {  /* motion variant is checked first */
+		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
+	}
+#  endif /* __OBJECT_MOTION__ */
+#  ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)  /* runtime check here, with no __KERNEL_CPU__ split */
+		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
+#  endif /* __INSTANCING__ */
+	return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
+}
+#endif  /* __VOLUME_RECORD_ALL__ */
+
+
+/* Ray offset to avoid self intersection.
+ *
+ * This function should be used to compute a modified ray start position for
+ * rays leaving from a surface: P is the position, Ng the normal to offset along. */
+
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
+{
+#ifdef __INTERSECTION_REFINE__
+	const float epsilon_f = 1e-5f;  /* scale of the additive offset used for components near zero */
+	/* ideally this should match epsilon_f, but instancing and motion blur
+	 * precision makes it problematic */
+	const float epsilon_test = 1.0f;  /* components with magnitude below this take the additive path */
+	const int epsilon_i = 32;  /* step applied to the float bit pattern otherwise */
+
+	float3 res;
+
+	/* x component */
+	if(fabsf(P.x) < epsilon_test) {
+		res.x = P.x + Ng.x*epsilon_f;
+	}
+	else {
+		uint ix = __float_as_uint(P.x);
+		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;  /* sign bits differ: shrink |P.x|, else grow it, so P.x moves towards the sign of Ng.x (assumes IEEE-754 bit layout) */
+		res.x = __uint_as_float(ix);
+	}
+
+	/* y component (same scheme as x) */
+	if(fabsf(P.y) < epsilon_test) {
+		res.y = P.y + Ng.y*epsilon_f;
+	}
+	else {
+		uint iy = __float_as_uint(P.y);
+		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
+		res.y = __uint_as_float(iy);
+	}
+
+	/* z component (same scheme as x) */
+	if(fabsf(P.z) < epsilon_test) {
+		res.z = P.z + Ng.z*epsilon_f;
+	}
+	else {
+		uint iz = __float_as_uint(P.z);
+		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
+		res.z = __uint_as_float(iz);
+	}
+
+	return res;
+#else
+	const float epsilon_f = 1e-4f;  /* without intersection refinement: plain fixed offset along Ng */
+	return P + epsilon_f*Ng;
+#endif
+}
+
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+/* Three-way comparison of two Intersection records by their t field (ascending). ToDo: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+	const Intersection *isect_a = (const Intersection*)a;  /* arguments arrive untyped; both are treated as Intersection */
+	const Intersection *isect_b = (const Intersection*)b;
+
+	if(isect_a->t < isect_b->t)
+		return -1;  /* a has the smaller t */
+	else if(isect_a->t > isect_b->t)
+		return 1;  /* a has the larger t */
+	else
+		return 0;  /* neither is smaller */
+}
+#endif  /* __SHADOW_RECORD_ALL__ || __VOLUME_RECORD_ALL__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
new file mode 100644
index 00000000000..5b0d8785d0e
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright 2011-2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Builds the Transform of child `child` from three float4 rows of __bvh_nodes at nodeAddr + child*3 + 1..3; row w is fixed to (0, 0, 0, 1). */
+// TODO(sergey): Look into avoiding the use of a full Transform and using a 3x3 matrix
+// and a 3-vector instead, which might be faster.
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+                                                           int nodeAddr,
+                                                           int child)
+{
+	Transform space;
+	const int child_addr = nodeAddr + child * 3;  /* each child occupies three consecutive rows */
+	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);  /* +1 skips the row at nodeAddr+0, which callers read as cnodes */
+	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
+	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
+	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+	return space;
+}
+
+#if !defined(__KERNEL_SSE2__)
+ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+                                                 const float3 P,
+                                                 const float3 idir,
+                                                 const float t,
+                                                 const int nodeAddr,
+                                                 const uint visibility,
+                                                 float dist[2])
+{
+	/* Tests the ray (origin P, per-axis multiplier idir, presumably 1/direction) against both child boxes; returns 1 and/or 2 OR-ed together for hit children 0 and 1. */
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);  /* x/y: bits tested against visibility for child 0/1 */
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);  /* x bounds, read as (c0 lo, c1 lo, c0 hi, c1 hi) */
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);  /* y bounds, same layout */
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);  /* z bounds, same layout */
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);  /* entry distance, clamped to >= 0 */
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);  /* exit distance, clamped to <= t */
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	dist[0] = c0min;  /* entry distances are written even for children that are missed */
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+
+ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+                                                        const float3 P,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const float difl,
+                                                        const float extmax,
+                                                        const int nodeAddr,
+                                                        const uint visibility,
+                                                        float dist[2])
+{
+	/* Same test as bvh_aligned_node_intersect(), but when difl != 0 the hit interval of children flagged PATH_RAY_CURVE is widened first. */
+	/* fetch node data */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);  /* x/y: visibility bits, z/w: bits tested against PATH_RAY_CURVE, for child 0/1 */
+	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);  /* x bounds, read as (c0 lo, c1 lo, c0 hi, c1 hi) */
+	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);  /* y bounds, same layout */
+	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);  /* z bounds, same layout */
+
+	/* intersect ray against child nodes */
+	float c0lox = (node0.x - P.x) * idir.x;
+	float c0hix = (node0.z - P.x) * idir.x;
+	float c0loy = (node1.x - P.y) * idir.y;
+	float c0hiy = (node1.z - P.y) * idir.y;
+	float c0loz = (node2.x - P.z) * idir.z;
+	float c0hiz = (node2.z - P.z) * idir.z;
+	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+	float c1lox = (node0.y - P.x) * idir.x;
+	float c1hix = (node0.w - P.x) * idir.x;
+	float c1loy = (node1.y - P.y) * idir.y;
+	float c1hiy = (node1.w - P.y) * idir.y;
+	float c1loz = (node2.y - P.z) * idir.z;
+	float c1hiz = (node2.w - P.z) * idir.z;
+	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+	if(difl != 0.0f) {
+		float hdiff = 1.0f + difl;  /* relative scale applied to the exit distance */
+		float ldiff = 1.0f - difl;  /* relative scale applied to the entry distance */
+		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);  /* the max() keeps the entry from moving back by more than extmax */
+			c0max = min(hdiff * c0max, c0max + extmax);  /* the min() keeps the exit from moving out by more than extmax */
+		}
+		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	dist[0] = c0min;  /* written after the widening above, even for missed children */
+	dist[1] = c1min;
+
+#ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#else
+	return ((c0max >= c0min)? 1: 0) |
+	       ((c1max >= c1min)? 2: 0);
+#endif
+}
+/* Intersects the ray with one child of an unaligned node, working in that child's own space; stores the entry distance in *dist and returns whether the child is hit. */
+ccl_device_inline bool bvh_unaligned_node_intersect_child(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        int nodeAddr,
+        int child,
+        float dist[2])
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);  /* ray direction in child space */
+	float3 aligned_P = transform_point(&space, P);  /* ray origin in child space */
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);  /* presumably -1/aligned_dir per component; confirm in bvh_inverse_direction() */
+	float3 tLowerXYZ = aligned_P * nrdir;  /* under that assumption: distances to the planes at coordinate 0 */
+	float3 tUpperXYZ = tLowerXYZ - nrdir;  /* under that assumption: distances to the planes at coordinate 1 */
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);  /* entry distance, clamped to >= 0 */
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);  /* exit distance, clamped to <= t */
+	*dist = tNear;  /* only the first element is written; callers in this file pass &dist[child] */
+	return tNear <= tFar;
+}
+/* Variant of bvh_unaligned_node_intersect_child() that, when difl != 0, relaxes the final near/far comparison by the factors (1 - difl) and (1 + difl). */
+ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+        KernelGlobals *kg,
+        const float3 P,
+        const float3 dir,
+        const float t,
+        const float difl,
+        int nodeAddr,
+        int child,
+        float dist[2])
+{
+	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
+	float3 aligned_dir = transform_direction(&space, dir);  /* ray direction in child space */
+	float3 aligned_P = transform_point(&space, P);  /* ray origin in child space */
+	float3 nrdir = -bvh_inverse_direction(aligned_dir);  /* presumably -1/aligned_dir per component; confirm in bvh_inverse_direction() */
+	float3 tLowerXYZ = aligned_P * nrdir;
+	float3 tUpperXYZ = tLowerXYZ - nrdir;
+	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
+	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
+	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
+	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
+	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
+	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
+	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
+	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
+	*dist = tNear;  /* the unrelaxed entry distance is reported; only the first element is written */
+	if(difl != 0.0f) {
+		/* TODO(sergey): Same as for QBVH, needs a proper use. */
+		const float round_down = 1.0f - difl;
+		const float round_up = 1.0f + difl;
+		return round_down*tNear <= round_up*tFar;  /* no extmax term here, unlike the aligned robust test */
+	}
+	else {
+		return tNear <= tFar;
+	}
+}
+/* Tests both children of an unaligned node; returns 1 and/or 2 OR-ed together for hit children 0 and 1 (idir is not used in the body). */
+ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const float3 idir,
+                                                   const float t,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float dist[2])
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);  /* x/y: bits tested against visibility for child 0/1 */
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))  /* the guard exists only with __VISIBILITY_FLAG__; otherwise the braces below run unconditionally */
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+/* Robust counterpart of bvh_unaligned_node_intersect(): forwards difl to the per-child test; idir and extmax are not used in the body. */
+ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const float3 idir,
+                                                          const float t,
+                                                          const float difl,
+                                                          const float extmax,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float dist[2])
+{
+	int mask = 0;
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);  /* x/y: bits tested against visibility for child 0/1 */
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 0, &dist[0])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.x) & visibility))  /* the guard exists only with __VISIBILITY_FLAG__; otherwise the braces below run unconditionally */
+#endif
+		{
+			mask |= 1;
+		}
+	}
+	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 1, &dist[1])) {
+#ifdef __VISIBILITY_FLAG__
+		if((__float_as_uint(cnodes.y) & visibility))
+#endif
+		{
+			mask |= 2;
+		}
+	}
+	return mask;
+}
+/* Node test dispatcher: chooses the unaligned or aligned two-child test from the PATH_RAY_NODE_UNALIGNED bit of the node's first row and returns its hit mask. */
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3 P,
+                                         const float3 dir,
+                                         const float3 idir,
+                                         const float t,
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);  /* same row the callees fetch again as cnodes */
+	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+		return bvh_unaligned_node_intersect(kg,
+		                                    P,
+		                                    dir,
+		                                    idir,
+		                                    t,
+		                                    nodeAddr,
+		                                    visibility,
+		                                    dist);
+	}
+	else {
+		return bvh_aligned_node_intersect(kg,  /* the aligned test takes idir only, so dir is not passed */
+		                                  P,
+		                                  idir,
+		                                  t,
+		                                  nodeAddr,
+		                                  visibility,
+		                                  dist);
+	}
+}
+
+/* Robust counterpart of bvh_node_intersect() for the non-SSE2 code path.
+ *
+ * Chooses between the aligned and unaligned robust child tests by looking at
+ * PATH_RAY_NODE_UNALIGNED in the node header, and passes difl and extmax on
+ * to whichever test is selected. The callee's result is returned untouched.
+ */
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3 P,
+                                                const float3 dir,
+                                                const float3 idir,
+                                                const float t,
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+
+	if(!is_unaligned) {
+		/* Aligned child boxes: dir is not forwarded. */
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P, idir, t,
+		                                         difl, extmax,
+		                                         nodeAddr, visibility, dist);
+	}
+
+	/* Unaligned child boxes: both dir and idir are forwarded. */
+	return bvh_unaligned_node_intersect_robust(kg,
+	                                           P, dir, idir,
+	                                           t,
+	                                           difl, extmax,
+	                                           nodeAddr, visibility,
+	                                           dist);
+}
+#else  /* !defined(__KERNEL_SSE2__) */
+
+/* SSE test of a ray against both axis-aligned child boxes of a BVH node
+ * (SSE3 version adapted from Embree). Returns a mask with bit 0/1 set for a
+ * hit on child 0/1 and stores the two entry distances in dist[]. */
+int ccl_device_inline bvh_aligned_node_intersect(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	const ssef sign_flip = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	/* rows 1..3 of the node feed the x, y and z slab tests of both children */
+	const ssef *rows = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+	const ssef slab_x = (shuffle_swap(rows[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef slab_y = (shuffle_swap(rows[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef slab_z = (shuffle_swap(rows[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* { c0min, c1min, -c0max, -c1max }, clipped against tsplat */
+	ssef clipped = max(max(slab_x, slab_y), max(slab_z, tsplat));
+	/* sign_flip turns the two negated exit distances back into c0max, c1max */
+	const ssef tminmax = clipped ^ sign_flip;
+
+	/* a child is hit when its entry distance does not exceed its exit distance */
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+	const int mask = movemask(lrhit);
+
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	return (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#  else
+	return mask & 3;
+#  endif
+}
+
+int ccl_device_inline bvh_aligned_node_intersect_robust(
+        KernelGlobals *kg,
+        const float3& P,
+        const float3& dir,
+        const ssef& tsplat,
+        const ssef Psplat[3],
+        const ssef idirsplat[3],
+        const shuffle_swap_t shufflexyz[3],
+        const float difl,
+        const float extmax,
+        const int nodeAddr,
+        const uint visibility,
+        float dist[2])
+{
+	/* Intersect two child bounding boxes, SSE3 version adapted from Embree.
+	 * If difl != 0, PATH_RAY_CURVE children get a widened [min, max] interval. */
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+
+	/* fetch node data */
+	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
+
+	/* intersect ray against child nodes */
+	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+	/* calculate { c0min, c1min, -c0max, -c1max} */
+	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+	/* must stay non-const: lanes are rewritten below; writing a const object is UB */
+	ssef tminmax = minmax ^ pn;
+
+	if(difl != 0.0f) {
+		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+		float4 *tminmaxview = (float4*)&tminmax;  /* scalar view of the four lanes */
+		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
+		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
+		float hdiff = 1.0f + difl;  /* factor applied to exit distances */
+		float ldiff = 1.0f - difl;  /* factor applied to entry distances */
+		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
+			c0min = max(ldiff * c0min, c0min - extmax);
+			c0max = min(hdiff * c0max, c0max + extmax);
+		}
+		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
+			c1min = max(ldiff * c1min, c1min - extmax);
+			c1max = min(hdiff * c1max, c1max + extmax);
+		}
+	}
+
+	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+	dist[0] = tminmax[0];
+	dist[1] = tminmax[1];
+	int mask = movemask(lrhit);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+	return cmask;
+#  else
+	return mask & 3;
+#  endif
+}
+
+/* SSE test of a ray against the two unaligned child boxes of a BVH node.
+ * The ray is transformed by each child's bvh_unaligned_node_fetch_space() and
+ * clipped per axis in that space and against [tnear, tfar]. Bit 0/1 of the
+ * result marks a hit on child 0/1; dist[] receives the two entry distances. */
+int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+                                                   const float3 P,
+                                                   const float3 dir,
+                                                   const ssef& tnear,
+                                                   const ssef& tfar,
+                                                   const int nodeAddr,
+                                                   const uint visibility,
+                                                   float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	/* ray origin and direction as seen from each child */
+	const float3 local_dir0 = transform_direction(&space0, dir);
+	const float3 local_dir1 = transform_direction(&space1, dir);
+	const float3 local_P0 = transform_point(&space0, P);
+	const float3 local_P1 = transform_point(&space1, P);
+
+	/* negated bvh_inverse_direction() of the local directions */
+	const float3 neg_idir0 = -bvh_inverse_direction(local_dir0);
+	const float3 neg_idir1 = -bvh_inverse_direction(local_dir1);
+
+	/* lanes are { child 0, child 1, 0, 0 } throughout */
+	const ssef scale_x(neg_idir0.x, neg_idir1.x, 0.0f, 0.0f);
+	const ssef scale_y(neg_idir0.y, neg_idir1.y, 0.0f, 0.0f);
+	const ssef scale_z(neg_idir0.z, neg_idir1.z, 0.0f, 0.0f);
+	const ssef lower_x(local_P0.x * neg_idir0.x, local_P1.x * neg_idir1.x, 0.0f, 0.0f);
+	const ssef lower_y(local_P0.y * neg_idir0.y, local_P1.y * neg_idir1.y, 0.0f, 0.0f);
+	const ssef lower_z(local_P0.z * neg_idir0.z, local_P1.z * neg_idir1.z, 0.0f, 0.0f);
+	const ssef upper_x = lower_x - scale_x;
+	const ssef upper_y = lower_y - scale_y;
+	const ssef upper_z = lower_z - scale_z;
+
+	/* latest entry and earliest exit over the three axes and [tnear, tfar] */
+	const ssef t_entry = max4(min(lower_x, upper_x),
+	                          min(lower_y, upper_y),
+	                          min(lower_z, upper_z),
+	                          tnear);
+	const ssef t_exit = min4(max(lower_x, upper_x),
+	                         max(lower_y, upper_y),
+	                         max(lower_z, upper_z),
+	                         tfar);
+	const sseb hit_lanes = t_entry <= t_exit;
+	dist[0] = t_entry.f[0];
+	dist[1] = t_entry.f[1];
+	const int mask = (int)movemask(hit_lanes);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	return (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#  else
+	return mask & 3;
+#  endif
+}
+
+/* Robust SSE test of a ray against the two unaligned child boxes of a node.
+ * Same procedure as the non-robust variant, except that for difl != 0 the
+ * entry distance is scaled by (1 - difl) and the exit distance by (1 + difl)
+ * before they are compared. Bit 0/1 of the result marks a hit on child 0/1;
+ * dist[] receives the two unscaled entry distances. */
+int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+                                                          const float3 P,
+                                                          const float3 dir,
+                                                          const ssef& tnear,
+                                                          const ssef& tfar,
+                                                          const float difl,
+                                                          const int nodeAddr,
+                                                          const uint visibility,
+                                                          float dist[2])
+{
+	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
+	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
+
+	/* ray origin and direction as seen from each child */
+	const float3 local_dir0 = transform_direction(&space0, dir);
+	const float3 local_dir1 = transform_direction(&space1, dir);
+	const float3 local_P0 = transform_point(&space0, P);
+	const float3 local_P1 = transform_point(&space1, P);
+
+	/* negated bvh_inverse_direction() of the local directions */
+	const float3 neg_idir0 = -bvh_inverse_direction(local_dir0);
+	const float3 neg_idir1 = -bvh_inverse_direction(local_dir1);
+
+	/* lanes are { child 0, child 1, 0, 0 } throughout */
+	const ssef scale_x(neg_idir0.x, neg_idir1.x, 0.0f, 0.0f);
+	const ssef scale_y(neg_idir0.y, neg_idir1.y, 0.0f, 0.0f);
+	const ssef scale_z(neg_idir0.z, neg_idir1.z, 0.0f, 0.0f);
+	const ssef lower_x(local_P0.x * neg_idir0.x, local_P1.x * neg_idir1.x, 0.0f, 0.0f);
+	const ssef lower_y(local_P0.y * neg_idir0.y, local_P1.y * neg_idir1.y, 0.0f, 0.0f);
+	const ssef lower_z(local_P0.z * neg_idir0.z, local_P1.z * neg_idir1.z, 0.0f, 0.0f);
+	const ssef upper_x = lower_x - scale_x;
+	const ssef upper_y = lower_y - scale_y;
+	const ssef upper_z = lower_z - scale_z;
+
+	/* latest entry and earliest exit over the three axes and [tnear, tfar] */
+	const ssef t_entry = max4(min(lower_x, upper_x),
+	                          min(lower_y, upper_y),
+	                          min(lower_z, upper_z),
+	                          tnear);
+	const ssef t_exit = min4(max(lower_x, upper_x),
+	                         max(lower_y, upper_y),
+	                         max(lower_z, upper_z),
+	                         tfar);
+
+	/* With a non-zero difl the interval is relaxed before the comparison;
+	 * otherwise the plain entry <= exit test is used. */
+	const float shrink = 1.0f - difl;
+	const float grow = 1.0f + difl;
+	const sseb hit_lanes = (difl != 0.0f)? (shrink*t_entry <= grow*t_exit)
+	                                     : (t_entry <= t_exit);
+
+	dist[0] = t_entry.f[0];
+	dist[1] = t_entry.f[1];
+
+	const int mask = (int)movemask(hit_lanes);
+
+#  ifdef __VISIBILITY_FLAG__
+	/* this visibility test gives a 5% performance hit, how to solve? */
+	const float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+	return (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
+	       (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
+#  else
+	return mask & 3;
+#  endif
+}
+
+/* SSE variant: intersect a ray with both children of an inner BVH node.
+ *
+ * Nodes whose header has PATH_RAY_NODE_UNALIGNED set go to the unaligned test
+ * (which takes tnear/tfar); all others go to the aligned test (which takes
+ * the splatted ray data). The selected test's child mask is returned. */
+ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+                                         const float3& P,
+                                         const float3& dir,
+                                         const ssef& tnear,
+                                         const ssef& tfar,
+                                         const ssef& tsplat,
+                                         const ssef Psplat[3],
+                                         const ssef idirsplat[3],
+                                         const shuffle_swap_t shufflexyz[3],
+                                         const int nodeAddr,
+                                         const uint visibility,
+                                         float dist[2])
+{
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+
+	if(!is_unaligned) {
+		/* Aligned child boxes: tnear and tfar are not forwarded. */
+		return bvh_aligned_node_intersect(kg,
+		                                  P, dir,
+		                                  tsplat, Psplat, idirsplat, shufflexyz,
+		                                  nodeAddr, visibility,
+		                                  dist);
+	}
+
+	/* Unaligned child boxes: the splatted ray data is not forwarded. */
+	return bvh_unaligned_node_intersect(kg,
+	                                    P, dir,
+	                                    tnear, tfar,
+	                                    nodeAddr, visibility,
+	                                    dist);
+}
+
+/* SSE variant: robust intersection of a ray with both children of a node.
+ *
+ * Dispatches on PATH_RAY_NODE_UNALIGNED in the node header. The aligned test
+ * receives the splatted ray data plus difl and extmax; the unaligned test
+ * receives tnear/tfar and difl only (extmax is not passed to it). The result
+ * of the selected test is returned unchanged. */
+ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+                                                const float3& P,
+                                                const float3& dir,
+                                                const ssef& tnear,
+                                                const ssef& tfar,
+                                                const ssef& tsplat,
+                                                const ssef Psplat[3],
+                                                const ssef idirsplat[3],
+                                                const shuffle_swap_t shufflexyz[3],
+                                                const float difl,
+                                                const float extmax,
+                                                const int nodeAddr,
+                                                const uint visibility,
+                                                float dist[2])
+{
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+
+	if(!is_unaligned) {
+		/* Aligned child boxes. */
+		return bvh_aligned_node_intersect_robust(kg,
+		                                         P, dir,
+		                                         tsplat, Psplat, idirsplat, shufflexyz,
+		                                         difl, extmax,
+		                                         nodeAddr, visibility,
+		                                         dist);
+	}
+
+	/* Unaligned child boxes. */
+	return bvh_unaligned_node_intersect_robust(kg,
+	                                           P, dir,
+	                                           tnear, tfar,
+	                                           difl,
+	                                           nodeAddr, visibility,
+	                                           dist);
+}
+#endif  /* !defined(__KERNEL_SSE2__) */
diff --git a/intern/cycles/kernel/bvh/bvh_shadow.h b/intern/cycles/kernel/bvh/bvh_shadow.h
new file mode 100644
index 00000000000..02147d20fee
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_shadow.h
@@ -0,0 +1,386 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#  include "qbvh_shadow.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#else  /* without BVH_HAIR the aligned test is called directly, no unaligned dispatch */
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* Template BVH traversal for shadow rays; features are switched on and off at
+ * compile time so each variant only pays for what it uses. Hits are appended
+ * to isect_array and counted in *num_hits; returns true as soon as a hit lacks
+ * SD_HAS_TRANSPARENT_SHADOW or max_hits is already reached, false otherwise.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits,
+                                            uint *num_hits)
+{
+	/* todo:
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;  /* popping the sentinel ends the loops below */
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;  /* hits recorded since the last instance push */
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#if !defined(__KERNEL_SSE2__)
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               PATH_RAY_SHADOW,
+				                               dist);
+#else // __KERNEL_SSE2__
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               PATH_RAY_SHADOW,
+				                               dist);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.z);  /* address of child 0 */
+				nodeAddrChild1 = __float_as_int(cnodes.w);  /* address of child 1 */
+
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* At most one child was intersected: mask 1 keeps child 0, mask 2 takes child 1. */
+					if(traverse_mask == 2) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf (negative address), fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {  /* primitive range; a negative value encodes an instance */
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
+						bool hit;
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(p_type) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* shadow ray early termination */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								shader = kernel_tex_fetch(__tri_shader, prim);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;  /* seed the fresh entry with the current ray length */
+						}
+
+						primAddr++;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+#  else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#  endif
+
+					triangle_intersect_precalc(dir, &isect_precalc);
+					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
+
+#  if defined(__KERNEL_SSE2__)
+					Psplat[0] = ssef(P.x);
+					Psplat[1] = ssef(P.y);
+					Psplat[2] = ssef(P.z);
+
+					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+					tfar = ssef(isect_t);
+#    endif
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;  /* marks where the instance's subtree ends */
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {  /* the sentinel just popped was pushed by an instance, not the one at index 0 */
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+#  else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#  endif
+
+				triangle_intersect_precalc(dir, &isect_precalc);
+
+				/* scale isect->t to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+#  else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#  endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			isect_t = tmax;  /* back to the length the ray was given (ray->t) */
+			isect_array->t = isect_t;
+
+#  if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect_t);
+#    endif
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
+
+/* Entry point of the shadow traversal for the feature set this file was
+ * instantiated with.
+ *
+ * Forwards all arguments unchanged to the QBVH implementation when __QBVH__ is
+ * compiled in and kernel_data.bvh.use_qbvh is set, and to the BVH
+ * implementation otherwise; the result of that call is returned as is.
+ * BVH_FUNCTION_NAME is #undef'ed right after this definition.
+ */
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits,
+                                         uint *num_hits)
+{
+#ifdef __QBVH__
+	/* QBVH path, chosen at run time. */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, num_hits);
+	}
+#endif
+
+	/* Regular BVH path; QBVH data must not be in use here. */
+	kernel_assert(kernel_data.bvh.use_qbvh == false);
+	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, num_hits);
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
new file mode 100644
index 00000000000..7121c5791df
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -0,0 +1,266 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#  include "qbvh_subsurface.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* Template BVH traversal for subsurface scattering. Features are switched with
+ * BVH_FEATURE() so that an optimized version is compiled for each case without
+ * unused features slowing things down. Traversal starts at the node of
+ * subsurface_object and hands candidate triangles to the
+ * *_intersect_subsurface() helpers together with ss_isect.
+ *
+ * BVH_MOTION: motion blur rendering */
+
+ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            SubsurfaceIntersection *ss_isect,
+                                            int subsurface_object,
+                                            uint *lcg_state,
+                                            int max_hits)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory; slot 0 holds the end-of-traversal sentinel */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); /* start at this object's __object_node entry */
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
+
+	ss_isect->num_hits = 0; /* no hits recorded yet */
+
+	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+	if(!(object_flag & SD_TRANSFORM_APPLIED)) { /* flag unset: run the instance push on the ray (presumably into object space) */
+#if BVH_FEATURE(BVH_MOTION)
+		Transform ob_itfm;
+		bvh_instance_motion_push(kg,
+		                         subsurface_object,
+		                         ray,
+		                         &P,
+		                         &dir,
+		                         &idir,
+		                         &isect_t,
+		                         &ob_itfm);
+#else
+		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+#endif
+		object = subsurface_object;
+	}
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes (non-negative addresses other than the sentinel) */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#if !defined(__KERNEL_SSE2__)
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               PATH_RAY_ALL_VISIBILITY,
+				                               dist);
+#else // __KERNEL_SSE2__
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               PATH_RAY_ALL_VISIBILITY,
+				                               dist);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.z); /* child 0 address, bit-cast from float */
+				nodeAddrChild1 = __float_as_int(cnodes.w); /* child 1 address, bit-cast from float */
+
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* At most one child was intersected; any other mask keeps child 0, already in nodeAddr. */
+					if(traverse_mask == 2) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf (negative address), fetch triangle list from __bvh_leaf_nodes[-nodeAddr-1] */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+				const int primAddr2 = __float_as_int(leaf.y);
+				const uint type = __float_as_int(leaf.w);
+
+				/* pop the next node to visit before testing this leaf's primitives */
+				nodeAddr = traversalStack[stackPtr];
+				--stackPtr;
+
+				/* primitive intersection over the range [primAddr, primAddr2) */
+				switch(type & PRIMITIVE_ALL) {
+					case PRIMITIVE_TRIANGLE: {
+						/* intersect ray against primitive */
+						for(; primAddr < primAddr2; primAddr++) {
+							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+							triangle_intersect_subsurface(kg,
+							                              &isect_precalc,
+							                              ss_isect,
+							                              P,
+							                              object,
+							                              primAddr,
+							                              isect_t,
+							                              lcg_state,
+							                              max_hits);
+						}
+						break;
+					}
+#if BVH_FEATURE(BVH_MOTION)
+					case PRIMITIVE_MOTION_TRIANGLE: {
+						/* intersect ray against primitive */
+						for(; primAddr < primAddr2; primAddr++) {
+							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+							motion_triangle_intersect_subsurface(kg,
+							                                     ss_isect,
+							                                     P,
+							                                     dir,
+							                                     ray->time,
+							                                     object,
+							                                     primAddr,
+							                                     isect_t,
+							                                     lcg_state,
+							                                     max_hits);
+						}
+						break;
+					}
+#endif
+					default: {
+						break;
+					}
+				}
+			}
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+	} while(nodeAddr != ENTRYPOINT_SENTINEL); /* same condition as the inner loop, so the outer loop never repeats */
+}
+
+/* Entry point of the subsurface traversal variant: forwards all arguments
+ * unchanged to the QBVH implementation when kernel_data.bvh.use_qbvh is set
+ * (only compiled in when __QBVH__ is defined), and to the regular BVH
+ * implementation otherwise. ss_isect is passed through to the callee.
+ *
+ * The callees return void, so they are invoked as plain statements instead of
+ * being used as a return expression: `return expr;` in a void function is
+ * only valid in C++, not in C.
+ */
+ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         SubsurfaceIntersection *ss_isect,
+                                         int subsurface_object,
+                                         uint *lcg_state,
+                                         int max_hits)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		BVH_FUNCTION_FULL_NAME(QBVH)(
+		        kg, ray, ss_isect, subsurface_object, lcg_state, max_hits);
+		return;
+	}
+#endif
+
+	kernel_assert(kernel_data.bvh.use_qbvh == false);
+	BVH_FUNCTION_FULL_NAME(BVH)(
+	        kg, ray, ss_isect, subsurface_object, lcg_state, max_hits);
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
new file mode 100644
index 00000000000..36c3398335c
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -0,0 +1,428 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#  include "qbvh_traversal.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#  define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#  define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
+#endif
+
+/* Template BVH traversal function; features are switched with BVH_FEATURE() so
+ * an optimized version is compiled for each case. Starts at kernel_data.bvh.root
+ * and returns whether isect->prim differs from PRIM_NONE; returns true right
+ * after the first hit when visibility == PATH_RAY_SHADOW_OPAQUE.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect,
+                                            const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                            , uint *lcg_state,
+                                            float difl,
+                                            float extmax
+#endif
+                                            )
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory; slot 0 holds the end-of-traversal sentinel */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE; /* "no hit" marker, checked by the final return */
+	isect->object = OBJECT_NONE;
+
+	BVH_DEBUG_INIT();
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect->t);
+#  endif
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes (non-negative addresses other than the sentinel) */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#if !defined(__KERNEL_SSE2__)
+#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					traverse_mask = NODE_INTERSECT_ROBUST(kg,
+					                                      P,
+#    if BVH_FEATURE(BVH_HAIR)
+					                                      dir,
+#    endif
+					                                      idir,
+					                                      isect->t,
+					                                      difl,
+					                                      extmax,
+					                                      nodeAddr,
+					                                      visibility,
+					                                      dist);
+				}
+				else
+#  endif
+				{
+					traverse_mask = NODE_INTERSECT(kg,
+					                               P,
+#    if BVH_FEATURE(BVH_HAIR)
+					                               dir,
+#    endif
+					                               idir,
+					                               isect->t,
+					                               nodeAddr,
+					                               visibility,
+					                               dist);
+				}
+#else // __KERNEL_SSE2__
+#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					traverse_mask = NODE_INTERSECT_ROBUST(kg,
+					                                      P,
+					                                      dir,
+#    if BVH_FEATURE(BVH_HAIR)
+					                                      tnear,
+					                                      tfar,
+#    endif
+					                                      tsplat,
+					                                      Psplat,
+					                                      idirsplat,
+					                                      shufflexyz,
+					                                      difl,
+					                                      extmax,
+					                                      nodeAddr,
+					                                      visibility,
+					                                      dist);
+				}
+				else
+#  endif
+				{
+					traverse_mask = NODE_INTERSECT(kg,
+					                               P,
+					                               dir,
+#    if BVH_FEATURE(BVH_HAIR)
+					                               tnear,
+					                               tfar,
+#    endif
+					                               tsplat,
+					                               Psplat,
+					                               idirsplat,
+					                               shufflexyz,
+					                               nodeAddr,
+					                               visibility,
+					                               dist);
+				}
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.z); /* child 0 address, bit-cast from float */
+				nodeAddrChild1 = __float_as_int(cnodes.w); /* child 1 address, bit-cast from float */
+
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* At most one child was intersected; any other mask keeps child 0, already in nodeAddr. */
+					if(traverse_mask == 2) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+				BVH_DEBUG_NEXT_STEP();
+			}
+
+			/* if node is leaf (negative address), fetch triangle list from __bvh_leaf_nodes[-nodeAddr-1] */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) { /* non-negative: primitive range; negative: instance, handled in the else below */
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* pop the next node to visit before testing this leaf's primitives */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection over the range [primAddr, primAddr2) */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); /* refresh SSE copy of isect->t after the hit */
+#  if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#  endif
+#else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#endif
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#  if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); /* refresh SSE copy of isect->t after the hit */
+#    if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#    endif
+#  else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#  endif
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								if(hit) {
+									/* shadow ray early termination */
+#  if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); /* refresh SSE copy of isect->t after the hit */
+#    if BVH_FEATURE(BVH_HAIR)
+									tfar = ssef(isect->t);
+#    endif
+#  else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#  endif
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+					}
+				} /* closes if(primAddr >= 0) with BVH_INSTANCING, otherwise closes if(nodeAddr < 0) */
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push: negative primAddr encodes the __prim_object index as -primAddr-1 */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+#  if defined(__KERNEL_SSE2__)
+					Psplat[0] = ssef(P.x);
+					Psplat[1] = ssef(P.y);
+					Psplat[2] = ssef(P.z);
+
+					tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+					tfar = ssef(isect->t);
+#    endif
+
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; /* popping this ends the inner loop and triggers the instance pop */
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+					BVH_DEBUG_NEXT_INSTANCE();
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) { /* entries remain, so the sentinel came from an instance push, not from slot 0 */
+			kernel_assert(object != OBJECT_NONE);
+
+			/* instance pop */
+#  if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+#  if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect->t);
+#    endif
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+/* Entry point of the regular traversal variant: forwards all arguments
+ * unchanged to the QBVH implementation when kernel_data.bvh.use_qbvh is set
+ * (only compiled in when __QBVH__ is defined), and to the regular BVH
+ * implementation otherwise. The result of the selected implementation is
+ * returned as-is.
+ *
+ * lcg_state, difl and extmax are only part of the signature, and only
+ * forwarded, in variants built with BVH_HAIR_MINIMUM_WIDTH.
+ */
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect,
+                                         const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                         , uint *lcg_state,
+                                         float difl,
+                                         float extmax
+#endif
+                                         )
+{
+#ifdef __QBVH__
+	/* QBVH path, selected at run time. */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility
+#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                    , lcg_state, difl, extmax
+#  endif
+		                                    );
+	}
+#endif
+
+	/* Regular BVH path; QBVH must not have been requested here. */
+	kernel_assert(kernel_data.bvh.use_qbvh == false);
+
+	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+	                                   , lcg_state, difl, extmax
+#endif
+	                                   );
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
new file mode 100644
index 00000000000..8b44c66f7aa
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -0,0 +1,324 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#  include "qbvh_volume.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* Template BVH traversal function for volumes; features are switched with
+ * BVH_FEATURE() so an optimized version is compiled for each case. Primitives
+ * and instances of objects without SD_OBJECT_HAS_VOLUME are skipped. Returns
+ * whether isect->prim differs from PRIM_NONE after traversal.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect,
+                                            const uint visibility)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory; slot 0 holds the end-of-traversal sentinel */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE; /* "no hit" marker, checked by the final return */
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect->t);
+#  endif
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes (non-negative addresses other than the sentinel) */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#if !defined(__KERNEL_SSE2__)
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect->t,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
+#else // __KERNEL_SSE2__
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.z); /* child 0 address, bit-cast from float */
+				nodeAddrChild1 = __float_as_int(cnodes.w); /* child 1 address, bit-cast from float */
+
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* At most one child was intersected; any other mask keeps child 0, already in nodeAddr. */
+					if(traverse_mask == 2) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf (negative address), fetch triangle list from __bvh_leaf_nodes[-nodeAddr-1] */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) { /* non-negative: primitive range; negative: instance, handled in the else below */
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* pop the next node to visit before testing this leaf's primitives */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection over the range [primAddr, primAddr2) */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object; outside an instance the owner is looked up per primitive */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object; outside an instance the owner is looked up per primitive */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif
+						default: {
+							break;
+						}
+					}
+				} /* closes if(primAddr >= 0) with BVH_INSTANCING, otherwise closes if(nodeAddr < 0) */
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push: negative primAddr encodes the __prim_object index as -primAddr-1 */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#  if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+#  if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+						tfar = ssef(isect->t);
+#    endif
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; /* popping this ends the inner loop and triggers the instance pop */
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* instance has no volume: do not enter it, pop the next node instead */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) { /* entries remain, so the sentinel came from an instance push, not from slot 0 */
+			kernel_assert(object != OBJECT_NONE);
+
+			/* instance pop */
+#  if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+#  if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect->t);
+#    endif
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+/* Entry point of the volume traversal variant: forwards all arguments
+ * unchanged to the QBVH implementation when kernel_data.bvh.use_qbvh is set
+ * (only compiled in when __QBVH__ is defined), and to the regular BVH
+ * implementation otherwise, returning the callee's result as-is.
+ */
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect,
+                                         const uint visibility)
+{
+#ifdef __QBVH__
+	/* QBVH path, selected at run time. */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
+	}
+#endif
+
+	/* Regular BVH path; QBVH must not have been requested here. */
+	kernel_assert(kernel_data.bvh.use_qbvh == false);
+
+	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
new file mode 100644
index 00000000000..445243c4e5c
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -0,0 +1,397 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#  include "qbvh_volume_all.h"
+#endif
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT bvh_node_intersect
+#else
+#  define NODE_INTERSECT bvh_aligned_node_intersect
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ * Stores at most max_hits volume object hits in isect_array, returns their count.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits,
+                                            const uint visibility)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+#  if BVH_FEATURE(BVH_HAIR)
+	ssef tnear(0.0f), tfar(isect_t);
+#  endif
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				int nodeAddrChild1, traverse_mask;
+				float dist[2];
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#if !defined(__KERNEL_SSE2__)
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               dir,
+#  endif
+				                               idir,
+				                               isect_t,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
+#else // __KERNEL_SSE2__
+				traverse_mask = NODE_INTERSECT(kg,
+				                               P,
+				                               dir,
+#  if BVH_FEATURE(BVH_HAIR)
+				                               tnear,
+				                               tfar,
+#  endif
+				                               tsplat,
+				                               Psplat,
+				                               idirsplat,
+				                               shufflexyz,
+				                               nodeAddr,
+				                               visibility,
+				                               dist);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.z);
+				nodeAddrChild1 = __float_as_int(cnodes.w);
+
+				if(traverse_mask == 3) {
+					/* Both children were intersected, push the farther one. */
+					bool closestChild1 = (dist[1] < dist[0]);
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* One child was intersected. */
+					if(traverse_mask == 2) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(traverse_mask == 0) {
+						/* Neither child was intersected. */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					bool hit;
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										/* Only hits recorded inside an instance need rescaling. */
+										if(object != OBJECT_NONE) {
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+#  else
+											Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+											float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#  if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#  endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#  if BVH_FEATURE(BVH_INSTANCING)
+										/* Only rescale instance hits; this case is only compiled with BVH_MOTION. */
+										if(object != OBJECT_NONE) {
+											float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#  endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif  /* BVH_MOTION */
+						default: {
+							break;
+						}
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+#  if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+#  else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#  endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+#  if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+						tfar = ssef(isect_t);
+#    endif
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+#  else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#  endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+#  else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#  endif
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#  if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+#    if BVH_FEATURE(BVH_HAIR)
+			tfar = ssef(isect_t);
+#    endif
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#  endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
+
+/* Entry point of this traversal template.
+ *
+ * Forwards all arguments to the QBVH flavor of the traversal when the kernel
+ * is compiled with __QBVH__ and kernel_data.bvh.use_qbvh is set, and to the
+ * regular BVH flavor otherwise.
+ *
+ * The value returned by the selected traversal is passed back unchanged.
+ */
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits,
+                                         const uint visibility)
+{
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		/* Early exit: the QBVH traversal handles the whole query. */
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, visibility);
+	}
+#endif
+
+	/* Getting here means the regular BVH layout is expected. */
+	kernel_assert(kernel_data.bvh.use_qbvh == false);
+	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility);
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
new file mode 100644
index 00000000000..5eda3213acb
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+struct QBVHStackItem {
+	int addr;    /* Node address kept on the traversal stack (or ENTRYPOINT_SENTINEL). */
+	float dist;  /* Distance key compared by qbvh_stack_sort() to order items. */
+};
+
+/* TODO(sergey): Investigate if using intrinsics helps for both
+ * stack item swap and float comparison.
+ */
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
+                                      QBVHStackItem *__restrict b)
+{
+	const QBVHStackItem saved_a = *a;  /* Copy of *a taken before it is overwritten. */
+	*a = *b;
+	*b = saved_a;
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3)
+{
+	if(s1->dist > s2->dist) { qbvh_item_swap(s1, s2); }  /* Now s1 <= s2. */
+	if(s2->dist > s3->dist) { qbvh_item_swap(s2, s3); }  /* Now s3 holds the largest dist. */
+	if(s1->dist > s2->dist) { qbvh_item_swap(s1, s2); }  /* Now s1 <= s2 <= s3. */
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3,
+                                       QBVHStackItem *__restrict s4)
+{
+	if(s1->dist > s2->dist) { qbvh_item_swap(s1, s2); }  /* Order the first pair. */
+	if(s3->dist > s4->dist) { qbvh_item_swap(s3, s4); }  /* Order the second pair. */
+	if(s1->dist > s3->dist) { qbvh_item_swap(s1, s3); }  /* Now s1 holds the smallest dist. */
+	if(s2->dist > s4->dist) { qbvh_item_swap(s2, s4); }  /* Now s4 holds the largest dist. */
+	if(s2->dist > s3->dist) { qbvh_item_swap(s2, s3); }  /* Order the two middle items. */
+}
+
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
+                                                  const ssef& tnear,
+                                                  const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                                  const sse3f& org_idir,
+#else
+                                                  const sse3f& org,
+#endif
+                                                  const sse3f& idir,
+                                                  const int near_x,
+                                                  const int near_y,
+                                                  const int near_z,
+                                                  const int far_x,
+                                                  const int far_y,
+                                                  const int far_z,
+                                                  const int nodeAddr,
+                                                  ssef *__restrict dist)
+{
+	const int offset = nodeAddr + 1;  /* Bounds are fetched relative to the slot after nodeAddr. */
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
+#endif
+	/* Combine the per-axis distances with [tnear, tfar] and build the per-lane hit mask. */
+#ifdef __KERNEL_SSE41__
+	const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear));
+	const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar));
+	const sseb vmask = cast(tNear) > cast(tFar);  /* Lanes where the interval is empty. */
+	int mask = (int)movemask(vmask)^0xf;  /* Flip the low four bits so that a set bit means overlap. */
+#else
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = tNear <= tFar;  /* Lanes where the interval is non-empty. */
+	int mask = (int)movemask(vmask);
+#endif
+	*dist = tNear;  /* Per-lane entry distance, written for every lane. */
+	return mask;
+}
+
+ccl_device_inline int qbvh_aligned_node_intersect_robust(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& P_idir,
+#else
+        const sse3f& P,
+#endif
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        const float difl,
+        ssef *__restrict dist)
+{
+	const int offset = nodeAddr + 1;  /* Bounds are fetched relative to the slot after nodeAddr. */
+#ifdef __KERNEL_AVX2__
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
+#endif
+	/* The overlap test below scales tNear by (1 - difl) and tFar by (1 + difl). */
+	const float round_down = 1.0f - difl;
+	const float round_up = 1.0f + difl;
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = round_down*tNear <= round_up*tFar;  /* Lanes passing the scaled test. */
+	*dist = tNear;  /* The unscaled per-lane entry distance is what gets reported. */
+	return (int)movemask(vmask);
+}
+
+/* Unaligned nodes intersection */
+
+/* Intersect the ray with the unaligned bounds stored for the node at nodeAddr.
+ * Ray origin and direction are transformed per lane into a space where the
+ * bounds span [0, 1] on every axis. Returns the mask of lanes whose clipped
+ * [tNear, tFar] interval is non-empty, per-lane tNear is written to dist.
+ */
+ccl_device_inline int qbvh_unaligned_node_intersect(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& org_idir,
+#endif
+        const sse3f& org,
+        const sse3f& dir,
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        ssef *__restrict dist)
+{
+	const int offset = nodeAddr;
+	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
+	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
+	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
+	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
+	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+
+	const ssef local_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z;
+	const ssef local_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z;
+	const ssef local_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+	const ssef local_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x;
+	const ssef local_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y;
+	const ssef local_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef neg_rdir_x = neg_one / local_dir_x;
+	const ssef neg_rdir_y = neg_one / local_dir_y;
+	const ssef neg_rdir_z = neg_one / local_dir_z;
+
+	const ssef t_lo_x = local_P_x * neg_rdir_x;
+	const ssef t_lo_y = local_P_y * neg_rdir_y;
+	const ssef t_lo_z = local_P_z * neg_rdir_z;
+
+	const ssef t_hi_x = t_lo_x - neg_rdir_x;
+	const ssef t_hi_y = t_lo_y - neg_rdir_y;
+	const ssef t_hi_z = t_lo_z - neg_rdir_z;
+
+#ifdef __KERNEL_SSE41__
+	const ssef tnear_x = mini(t_lo_x, t_hi_x);
+	const ssef tnear_y = mini(t_lo_y, t_hi_y);
+	const ssef tnear_z = mini(t_lo_z, t_hi_z);
+	const ssef tfar_x = maxi(t_lo_x, t_hi_x);
+	const ssef tfar_y = maxi(t_lo_y, t_hi_y);
+	const ssef tfar_z = maxi(t_lo_z, t_hi_z);
+#else
+	const ssef tnear_x = min(t_lo_x, t_hi_x);
+	const ssef tnear_y = min(t_lo_y, t_hi_y);
+	const ssef tnear_z = min(t_lo_z, t_hi_z);
+	const ssef tfar_x = max(t_lo_x, t_hi_x);
+	const ssef tfar_y = max(t_lo_y, t_hi_y);
+	const ssef tfar_z = max(t_lo_z, t_hi_z);
+#endif
+	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
+	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
+	const sseb vmask = tNear <= tFar;
+	*dist = tNear;
+	return movemask(vmask);
+}
+
+ccl_device_inline int qbvh_unaligned_node_intersect_robust(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& P_idir,
+#endif
+        const sse3f& P,
+        const sse3f& dir,
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        const float difl,
+        ssef *__restrict dist)
+{
+	const int offset = nodeAddr;
+	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
+	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
+	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
+
+	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
+	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
+	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
+
+	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
+	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
+	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
+
+	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
+	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
+	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
+	/* Ray direction and origin transformed per lane by the fetched rows (tfm_t_* is added to P only). */
+	const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+	           aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+	           aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+	const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
+	           aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+	           aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+	/* tlower = -P/dir is where the coordinate reaches 0, tupper = (1 - P)/dir where it reaches 1. */
+	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
+	const ssef nrdir_x = neg_one / aligned_dir_x,
+	           nrdir_y = neg_one / aligned_dir_y,
+	           nrdir_z = neg_one / aligned_dir_z;
+
+	const ssef tlower_x = aligned_P_x * nrdir_x,
+	           tlower_y = aligned_P_y * nrdir_y,
+	           tlower_z = aligned_P_z * nrdir_z;
+
+	const ssef tupper_x = tlower_x - nrdir_x,
+	           tupper_y = tlower_y - nrdir_y,
+	           tupper_z = tlower_z - nrdir_z;
+	/* The final overlap test scales tNear by (1 - difl) and tFar by (1 + difl). */
+	const float round_down = 1.0f - difl;
+	const float round_up = 1.0f + difl;
+
+#ifdef __KERNEL_SSE41__
+	const ssef tnear_x = mini(tlower_x, tupper_x);
+	const ssef tnear_y = mini(tlower_y, tupper_y);
+	const ssef tnear_z = mini(tlower_z, tupper_z);
+	const ssef tfar_x = maxi(tlower_x, tupper_x);
+	const ssef tfar_y = maxi(tlower_y, tupper_y);
+	const ssef tfar_z = maxi(tlower_z, tupper_z);
+#else
+	const ssef tnear_x = min(tlower_x, tupper_x);
+	const ssef tnear_y = min(tlower_y, tupper_y);
+	const ssef tnear_z = min(tlower_z, tupper_z);
+	const ssef tfar_x = max(tlower_x, tupper_x);
+	const ssef tfar_y = max(tlower_y, tupper_y);
+	const ssef tfar_z = max(tlower_z, tupper_z);
+#endif
+	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
+	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
+	const sseb vmask = round_down*tNear <= round_up*tFar;  /* Lanes passing the scaled test. */
+	*dist = tNear;  /* The unscaled per-lane entry distance is what gets reported. */
+	return movemask(vmask);
+}
+
+/* Intersectors wrappers.
+ *
+ * They'll check node type and call appropriate intersection code.
+ */
+
+ccl_device_inline int qbvh_node_intersect(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& org_idir,
+#endif
+        const sse3f& org,
+        const sse3f& dir,
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        ssef *__restrict dist)
+{
+	/* The float4 stored at nodeAddr carries the node flags in its x component. */
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+	if(!is_unaligned) {
+		return qbvh_aligned_node_intersect(kg,
+		                                   tnear,
+		                                   tfar,
+#ifdef __KERNEL_AVX2__
+		                                   org_idir,
+#else
+		                                   org,
+#endif
+		                                   idir,
+		                                   near_x, near_y, near_z,
+		                                   far_x, far_y, far_z,
+		                                   nodeAddr,
+		                                   dist);
+	}
+
+	return qbvh_unaligned_node_intersect(kg,
+	                                     tnear,
+	                                     tfar,
+#ifdef __KERNEL_AVX2__
+	                                     org_idir,
+#endif
+	                                     org,
+	                                     dir,
+	                                     idir,
+	                                     near_x, near_y, near_z,
+	                                     far_x, far_y, far_z,
+	                                     nodeAddr,
+	                                     dist);
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(
+        KernelGlobals *__restrict kg,
+        const ssef& tnear,
+        const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+        const sse3f& P_idir,
+#endif
+        const sse3f& P,
+        const sse3f& dir,
+        const sse3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int nodeAddr,
+        const float difl,
+        ssef *__restrict dist)
+{
+	/* The float4 stored at nodeAddr carries the node flags in its x component. */
+	const float4 header = kernel_tex_fetch(__bvh_nodes, nodeAddr);
+	const bool is_unaligned = (__float_as_uint(header.x) & PATH_RAY_NODE_UNALIGNED) != 0;
+	if(!is_unaligned) {
+		return qbvh_aligned_node_intersect_robust(kg,
+		                                          tnear,
+		                                          tfar,
+#ifdef __KERNEL_AVX2__
+		                                          P_idir,
+#else
+		                                          P,
+#endif
+		                                          idir,
+		                                          near_x, near_y, near_z,
+		                                          far_x, far_y, far_z,
+		                                          nodeAddr,
+		                                          difl,
+		                                          dist);
+	}
+
+	return qbvh_unaligned_node_intersect_robust(kg,
+	                                            tnear,
+	                                            tfar,
+#ifdef __KERNEL_AVX2__
+	                                            P_idir,
+#endif
+	                                            P,
+	                                            dir,
+	                                            idir,
+	                                            near_x, near_y, near_z,
+	                                            far_x, far_y, far_z,
+	                                            nodeAddr,
+	                                            difl,
+	                                            dist);
+}
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow.h b/intern/cycles/kernel/bvh/qbvh_shadow.h
new file mode 100644
index 00000000000..e5e611a0d47
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_shadow.h
@@ -0,0 +1,449 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for shadow rays, where various
+ * features can be enabled/disabled. This way we can compile optimized versions
+ * for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+/* Shadow traversal: returns true when light is fully blocked; otherwise hits are stored in isect_array and counted in *num_hits. */
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             uint *num_hits)
+{
+	/* TODO(sergey):
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack: fixed-size local array, slot 0 holds the sentinel that ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the top stack entry and the node being visited. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters; P, dir, idir and isect_t are rewritten on instance push/pop. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {  /* Non-finite ray origin: report no blocking hit. */
+		return false;
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;  /* Hits recorded since the last instance push. */
+#endif
+
+	ssef tnear(0.0f), tfar(tmax);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
+					/* Node has no PATH_RAY_SHADOW bit set: skip it and pop the next node. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif
+
+				ssef dist;
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
+#ifdef __KERNEL_AVX2__
+				                                   P_idir4,
+#endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#  endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes;  /* Child node addresses, stored as float bit patterns. */
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}
+
+					/* One child is hit, continue with that child (__bscf() presumably clears one set bit and returns its index). */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* Negative address denotes a leaf: fetch its entry from the leaf node array. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+					/* Leaf has no PATH_RAY_SHADOW bit set: skip it and pop the next node. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop the next node now; the primitives [primAddr, primAddr2) are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
+						bool hit;
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(p_type) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* Shadow ray early termination. */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								shader = kernel_tex_fetch(__tri_shader, prim);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;
+						}
+
+						primAddr++;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push: a negative primAddr refers to an object; continue from that object's node. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+#  else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#  endif
+
+					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {  /* Instance pop: the sentinel pushed on instance entry was reached. */
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+#  else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#  endif
+
+				/* scale isect->t to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+#  else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#  endif
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(tmax);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
new file mode 100644
index 00000000000..4adaf9c8f3d
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -0,0 +1,299 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+/* Subsurface traversal: walks only the BVH of subsurface_object; resets ss_isect->num_hits and hands ss_isect to the triangle intersectors. */
+ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             SubsurfaceIntersection *ss_isect,
+                                             int subsurface_object,
+                                             uint *lcg_state,
+                                             int max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - SSE for hair.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack: fixed-size local array, slot 0 holds the sentinel that ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: top-of-stack index and current node, starting at the object's own node. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object);
+
+	/* Ray parameters; may be rewritten once below by the instance push. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
+
+	ss_isect->num_hits = 0;
+
+	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
+	if(!(object_flag & SD_TRANSFORM_APPLIED)) {  /* Flag unset: instance-push the ray for this object. */
+#if BVH_FEATURE(BVH_MOTION)
+		Transform ob_itfm;
+		bvh_instance_motion_push(kg,
+		                         subsurface_object,
+		                         ray,
+		                         &P,
+		                         &dir,
+		                         &idir,
+		                         &isect_t,
+		                         &ob_itfm);
+#else
+		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+#endif
+		object = subsurface_object;
+	}
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {  /* Non-finite ray origin: nothing to intersect. */
+		return;
+	}
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
+#ifdef __KERNEL_AVX2__
+				                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes;  /* Child node addresses, stored as float bit patterns. */
+#if BVH_FEATURE(BVH_HAIR)  /* Node header is only read here, to detect unaligned nodes. */
+					float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}
+
+					/* One child is hit, continue with that child (__bscf() presumably clears one set bit and returns its index). */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* Negative address denotes a leaf: fetch its primitive range [primAddr, primAddr2). */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+				int primAddr2 = __float_as_int(leaf.y);
+				const uint type = __float_as_int(leaf.w);
+
+				/* Pop the next node now; the leaf's primitives are tested below. */
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+
+				/* Primitive intersection. */
+				switch(type & PRIMITIVE_ALL) {
+					case PRIMITIVE_TRIANGLE: {
+						/* Intersect ray against primitive. */
+						for(; primAddr < primAddr2; primAddr++) {
+							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+							triangle_intersect_subsurface(kg,
+							                              &isect_precalc,
+							                              ss_isect,
+							                              P,
+							                              object,
+							                              primAddr,
+							                              isect_t,
+							                              lcg_state,
+							                              max_hits);
+						}
+						break;
+					}
+#if BVH_FEATURE(BVH_MOTION)
+					case PRIMITIVE_MOTION_TRIANGLE: {
+						/* Intersect ray against primitive. */
+						for(; primAddr < primAddr2; primAddr++) {
+							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+							motion_triangle_intersect_subsurface(kg,
+							                                     ss_isect,
+							                                     P,
+							                                     dir,
+							                                     ray->time,
+							                                     object,
+							                                     primAddr,
+							                                     isect_t,
+							                                     lcg_state,
+							                                     max_hits);
+						}
+						break;
+					}
+#endif
+					default:  /* Any other primitive type is skipped. */
+						break;
+				}
+			}
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);  /* NOTE(review): same condition as the inner loop, so this never repeats. */
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
new file mode 100644
index 00000000000..24bf85f46c8
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -0,0 +1,465 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#  define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#  define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                             ,uint *lcg_state,
+                                             float difl,
+                                             float extmax
+#endif
+                                             )
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack in CUDA thread-local memory. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+	traversalStack[0].dist = -FLT_MAX;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+	float nodeDist = -FLT_MAX;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+	BVH_DEBUG_INIT();
+
+	ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+
+				if(UNLIKELY(nodeDist > isect->t)
+#ifdef __VISIBILITY_FLAG__
+				   || (__float_as_uint(inodes.x) & visibility) == 0)
+#endif
+				{
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int traverseChild;
+				ssef dist;
+
+				BVH_DEBUG_NEXT_STEP();
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					/* NOTE: We extend all the child BB instead of fetching
+					 * and checking visibility flags for each of the,
+					 *
+					 * Need to test if doing opposite would be any faster.
+					 */
+					traverseChild = NODE_INTERSECT_ROBUST(kg,
+					                                      tnear,
+					                                      tfar,
+#  ifdef __KERNEL_AVX2__
+					                                      P_idir4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					                                      org4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR)
+					                                      dir4,
+#  endif
+					                                      idir4,
+					                                      near_x, near_y, near_z,
+					                                      far_x, far_y, far_z,
+					                                      nodeAddr,
+					                                      difl,
+					                                      &dist);
+				}
+				else
+#endif  /* BVH_HAIR_MINIMUM_WIDTH */
+				{
+					traverseChild = NODE_INTERSECT(kg,
+					                               tnear,
+					                               tfar,
+#ifdef __KERNEL_AVX2__
+					                               P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					                               org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+					                               dir4,
+#endif
+					                               idir4,
+					                               near_x, near_y, near_z,
+					                               far_x, far_y, far_z,
+					                               nodeAddr,
+					                               &dist);
+				}
+
+				if(traverseChild != 0) {
+					float4 cnodes;
+					/* TODO(sergey): Investigate whether moving cnodes upwards
+					 * gives a speedup (will be different cache pattern but will
+					 * avoid extra check here),
+					 */
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					float d0 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						nodeDist = d0;
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							nodeDist = d1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							nodeDist = d0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						nodeDist = traversalStack[stackPtr].dist;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				nodeDist = traversalStack[stackPtr].dist;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+
+#ifdef __VISIBILITY_FLAG__
+				if(UNLIKELY((nodeDist > isect->t) ||
+				            ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+				if(UNLIKELY((nodeDist > isect->t)))
+#endif
+				{
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								if(hit) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+					qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_itfm);
+#  else
+					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist);
+#  endif
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+					traversalStack[stackPtr].dist = -FLT_MAX;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+					BVH_DEBUG_NEXT_INSTANCE();
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+#  if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			nodeDist = traversalStack[stackPtr].dist;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
new file mode 100644
index 00000000000..da21ede9e12
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -0,0 +1,374 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ * BVH_HAIR: unaligned BVH nodes
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel which ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+	ssef tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);  /* Also read by BVH_HAIR below. */
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(inodes.x) & visibility) == 0) {
+					/* Node is not visible to this ray, pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif
+
+				ssef dist;
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
+#ifdef __KERNEL_AVX2__
+				                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop the next node now; this leaf's primitives are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#  if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Instance has no volume: skip it and pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+#  if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+#  else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#  endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
new file mode 100644
index 00000000000..8a31775fae3
--- /dev/null
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -0,0 +1,446 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ * BVH_HAIR: unaligned BVH nodes
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT qbvh_node_intersect
+#else
+#  define NODE_INTERSECT qbvh_aligned_node_intersect
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             const uint visibility)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel which ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_itfm;
+#endif
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return 0;
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#endif
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				/* Fetched unconditionally: read by the visibility test and by BVH_HAIR. */
+				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(inodes.x) & visibility) == 0) {
+					/* Node is not visible to this ray, pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif
+
+				ssef dist;
+				int traverseChild = NODE_INTERSECT(kg,
+				                                   tnear,
+				                                   tfar,
+#ifdef __KERNEL_AVX2__
+				                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+				                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+				                                   dir4,
+#endif
+				                                   idir4,
+				                                   near_x, near_y, near_z,
+				                                   far_x, far_y, far_z,
+				                                   nodeAddr,
+				                                   &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
+					}
+					else
+#endif
+					{
+						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
+					}
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+					bool hit;
+
+					/* Pop the next node now; this leaf's primitives are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										/* Only hits recorded inside an instance need rescaling. */
+										if(object != OBJECT_NONE) {
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+#  else
+											Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+											float t_fac = 1.0f / len(transform_direction(&itfm, dir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#  if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#  endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#  if BVH_FEATURE(BVH_INSTANCING)
+										/* Only rescale inside an instance; BVH_MOTION is always on here. */
+										if(object != OBJECT_NONE) {
+											float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#  endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#  if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+#  else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#  endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  if BVH_FEATURE(BVH_HAIR)
+						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+#  ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Instance has no volume: skip it and pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+			if(num_hits_in_instance) {
+				float t_fac;
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+#  else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#  endif
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#  if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+#  else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#  endif
+			}
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
+#  endif
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#  ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#  endif
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 33e91d1ee44..d2c7edb11ea 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -15,14 +15,6 @@
  * limitations under the License.
  */
 
-/* bottom-most stack entry, indicating the end of traversal */
-#define ENTRYPOINT_SENTINEL 0x76543210
-
-/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
-#define BVH_STACK_SIZE 192
-#define BVH_QSTACK_SIZE 384
-#define TRI_NODE_SIZE 3
-
 #include "geom_attribute.h"
 #include "geom_object.h"
 #include "geom_triangle.h"
@@ -32,5 +24,4 @@
 #include "geom_curve.h"
 #include "geom_volume.h"
 #include "geom_primitive.h"
-#include "geom_bvh.h"
 
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
deleted file mode 100644
index f8d563f0afa..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* BVH
- *
- * Bounding volume hierarchy for ray tracing. We compile different variations
- * of the same BVH traversal function for faster rendering when some types of
- * primitives are not needed, using #includes to work around the lack of
- * C++ templates in OpenCL.
- *
- * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
- * the code has been extended and modified to support more primitives and work
- * with CPU/CUDA/OpenCL. */
-
-CCL_NAMESPACE_BEGIN
-
-/* Don't inline intersect functions on GPU, this is faster */
-#ifdef __KERNEL_GPU__
-#  define ccl_device_intersect ccl_device_noinline
-#else
-#  define ccl_device_intersect ccl_device_inline
-#endif
-
-/* BVH intersection function variations */
-
-#define BVH_INSTANCING			1
-#define BVH_MOTION				2
-#define BVH_HAIR				4
-#define BVH_HAIR_MINIMUM_WIDTH	8
-
-#define BVH_NAME_JOIN(x,y) x ## _ ## y
-#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y)
-#define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME)
-
-#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-/* Debugging heleprs */
-#ifdef __KERNEL_DEBUG__
-#  define BVH_DEBUG_INIT() \
-	do { \
-		isect->num_traversal_steps = 0; \
-		isect->num_traversed_instances = 0; \
-	} while(0)
-#  define BVH_DEBUG_NEXT_STEP() \
-	do { \
-		++isect->num_traversal_steps; \
-	} while(0)
-#  define BVH_DEBUG_NEXT_INSTANCE() \
-	do { \
-		++isect->num_traversed_instances; \
-	} while(0)
-#else  /* __KERNEL_DEBUG__ */
-#  define BVH_DEBUG_INIT()
-#  define BVH_DEBUG_NEXT_STEP()
-#  define BVH_DEBUG_NEXT_INSTANCE()
-#endif  /* __KERNEL_DEBUG__ */
-
-
-/* Common QBVH functions. */
-#ifdef __QBVH__
-#  include "geom_qbvh.h"
-#endif
-
-/* Regular BVH traversal */
-
-#include "geom_bvh_nodes.h"
-
-#define BVH_FUNCTION_NAME bvh_intersect
-#define BVH_FUNCTION_FEATURES 0
-#include "geom_bvh_traversal.h"
-
-#if defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "geom_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__)
-#  define BVH_FUNCTION_NAME bvh_intersect_hair
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "geom_bvh_traversal.h"
-#endif
-
-#if defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "geom_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "geom_bvh_traversal.h"
-#endif
-
-/* Subsurface scattering BVH traversal */
-
-#if defined(__SUBSURFACE__)
-#  define BVH_FUNCTION_NAME bvh_intersect_subsurface
-#  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "geom_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-#  define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#  include "geom_bvh_subsurface.h"
-#endif
-
-/* Volume BVH traversal */
-
-#if defined(__VOLUME__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume
-#  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "geom_bvh_volume.h"
-#endif
-
-#if defined(__VOLUME__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "geom_bvh_volume.h"
-#endif
-
-#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#  include "geom_bvh_volume.h"
-#endif
-
-/* Record all intersections - Shadow BVH traversal */
-
-#if defined(__SHADOW_RECORD_ALL__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
-#  define BVH_FUNCTION_FEATURES 0
-#  include "geom_bvh_shadow.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "geom_bvh_shadow.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "geom_bvh_shadow.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "geom_bvh_shadow.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#  include "geom_bvh_shadow.h"
-#endif
-
-/* Record all intersections - Volume BVH traversal  */
-
-#if defined(__VOLUME_RECORD_ALL__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_all
-#  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "geom_bvh_volume_all.h"
-#endif
-
-#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "geom_bvh_volume_all.h"
-#endif
-
-#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#  include "geom_bvh_volume_all.h"
-#endif
-
-#undef BVH_FEATURE
-#undef BVH_NAME_JOIN
-#undef BVH_NAME_EVAL
-#undef BVH_FUNCTION_FULL_NAME
-
-ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
-                                          const Ray *ray,
-                                          const uint visibility,
-                                          Intersection *isect,
-                                          uint *lcg_state,
-                                          float difl,
-                                          float extmax)
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#  ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#  endif /* __HAIR__ */
-
-		return bvh_intersect_motion(kg, ray, isect, visibility);
-	}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-#  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_instancing(kg, ray, isect, visibility);
-#  endif /* __INSTANCING__ */
-
-	return bvh_intersect(kg, ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
-
-#  ifdef __INSTANCING__
-	return bvh_intersect_instancing(kg, ray, isect, visibility);
-#  else
-	return bvh_intersect(kg, ray, isect, visibility);
-#  endif /* __INSTANCING__ */
-
-#endif /* __KERNEL_CPU__ */
-}
-
-#ifdef __SUBSURFACE__
-ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
-                                                     SubsurfaceIntersection *ss_isect,
-                                                     int subsurface_object,
-                                                     uint *lcg_state,
-                                                     int max_hits)
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
-		                                       ss_isect,
-		                                       subsurface_object,
-		                                       lcg_state,
-		                                       max_hits);
-	}
-#endif /* __OBJECT_MOTION__ */
-	return bvh_intersect_subsurface(kg,
-	                                ray,
-	                                ss_isect,
-	                                subsurface_object,
-	                                lcg_state,
-	                                max_hits);
-}
-#endif
-
-#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
-{
-#  ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
-#    endif /* __HAIR__ */
-
-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
-	}
-#  endif /* __OBJECT_MOTION__ */
-
-#  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
-#  endif /* __HAIR__ */
-
-#  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
-#  endif /* __INSTANCING__ */
-
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
-}
-#endif  /* __SHADOW_RECORD_ALL__ */
-
-#ifdef __VOLUME__
-ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
-                                                 const Ray *ray,
-                                                 Intersection *isect,
-                                                 const uint visibility)
-{
-#  ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-		return bvh_intersect_volume_motion(kg, ray, isect, visibility);
-	}
-#  endif /* __OBJECT_MOTION__ */
-#  ifdef __KERNEL_CPU__
-#    ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
-	return bvh_intersect_volume(kg, ray, isect, visibility);
-#  else /* __KERNEL_CPU__ */
-#    ifdef __INSTANCING__
-	return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-#    else
-	return bvh_intersect_volume(kg, ray, isect, visibility);
-#    endif /* __INSTANCING__ */
-#  endif /* __KERNEL_CPU__ */
-}
-#endif  /* __VOLUME__ */
-
-#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
-                                                     const Ray *ray,
-                                                     Intersection *isect,
-                                                     const uint max_hits,
-                                                     const uint visibility)
-{
-#  ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
-	}
-#  endif /* __OBJECT_MOTION__ */
-#  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
-#  endif /* __INSTANCING__ */
-	return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
-}
-#endif  /* __VOLUME_RECORD_ALL__ */
-
-
-/* Ray offset to avoid self intersection.
- *
- * This function should be used to compute a modified ray start position for
- * rays leaving from a surface. */
-
-ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
-{
-#ifdef __INTERSECTION_REFINE__
-	const float epsilon_f = 1e-5f;
-	/* ideally this should match epsilon_f, but instancing and motion blur
-	 * precision makes it problematic */
-	const float epsilon_test = 1.0f;
-	const int epsilon_i = 32;
-
-	float3 res;
-
-	/* x component */
-	if(fabsf(P.x) < epsilon_test) {
-		res.x = P.x + Ng.x*epsilon_f;
-	}
-	else {
-		uint ix = __float_as_uint(P.x);
-		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
-		res.x = __uint_as_float(ix);
-	}
-
-	/* y component */
-	if(fabsf(P.y) < epsilon_test) {
-		res.y = P.y + Ng.y*epsilon_f;
-	}
-	else {
-		uint iy = __float_as_uint(P.y);
-		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
-		res.y = __uint_as_float(iy);
-	}
-
-	/* z component */
-	if(fabsf(P.z) < epsilon_test) {
-		res.z = P.z + Ng.z*epsilon_f;
-	}
-	else {
-		uint iz = __float_as_uint(P.z);
-		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
-		res.z = __uint_as_float(iz);
-	}
-
-	return res;
-#else
-	const float epsilon_f = 1e-4f;
-	return P + epsilon_f*Ng;
-#endif
-}
-
-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
-/* ToDo: Move to another file? */
-ccl_device int intersections_compare(const void *a, const void *b)
-{
-	const Intersection *isect_a = (const Intersection*)a;
-	const Intersection *isect_b = (const Intersection*)b;
-
-	if(isect_a->t < isect_b->t)
-		return -1;
-	else if(isect_a->t > isect_b->t)
-		return 1;
-	else
-		return 0;
-}
-#endif
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_nodes.h b/intern/cycles/kernel/geom/geom_bvh_nodes.h
deleted file mode 100644
index 5b0d8785d0e..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_nodes.h
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- * Copyright 2011-2016, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
-// 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int nodeAddr,
-                                                           int child)
-{
-	Transform space;
-	const int child_addr = nodeAddr + child * 3;
-	space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
-	space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
-	space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
-	space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-	return space;
-}
-
-#if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int nodeAddr,
-                                                 const uint visibility,
-                                                 float dist[2])
-{
-
-	/* fetch node data */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-	/* intersect ray against child nodes */
-	float c0lox = (node0.x - P.x) * idir.x;
-	float c0hix = (node0.z - P.x) * idir.x;
-	float c0loy = (node1.x - P.y) * idir.y;
-	float c0hiy = (node1.z - P.y) * idir.y;
-	float c0loz = (node2.x - P.z) * idir.z;
-	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-	float c1lox = (node0.y - P.x) * idir.x;
-	float c1hix = (node0.w - P.x) * idir.x;
-	float c1loy = (node1.y - P.y) * idir.y;
-	float c1hiy = (node1.w - P.y) * idir.y;
-	float c1loz = (node2.y - P.z) * idir.z;
-	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-	dist[0] = c0min;
-	dist[1] = c1min;
-
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-#else
-	return ((c0max >= c0min)? 1: 0) |
-	       ((c1max >= c1min)? 2: 0);
-#endif
-}
-
-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int nodeAddr,
-                                                        const uint visibility,
-                                                        float dist[2])
-{
-
-	/* fetch node data */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr+1);
-	float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr+2);
-	float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr+3);
-
-	/* intersect ray against child nodes */
-	float c0lox = (node0.x - P.x) * idir.x;
-	float c0hix = (node0.z - P.x) * idir.x;
-	float c0loy = (node1.x - P.y) * idir.y;
-	float c0hiy = (node1.z - P.y) * idir.y;
-	float c0loz = (node2.x - P.z) * idir.z;
-	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-	float c1lox = (node0.y - P.x) * idir.x;
-	float c1hix = (node0.w - P.x) * idir.x;
-	float c1loy = (node1.y - P.y) * idir.y;
-	float c1hiy = (node1.w - P.y) * idir.y;
-	float c1loz = (node2.y - P.z) * idir.z;
-	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-	if(difl != 0.0f) {
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-
-	dist[0] = c0min;
-	dist[1] = c1min;
-
-#ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	       (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-#else
-	return ((c0max >= c0min)? 1: 0) |
-	       ((c1max >= c1min)? 2: 0);
-#endif
-}
-
-ccl_device_inline bool bvh_unaligned_node_intersect_child(
-        KernelGlobals *kg,
-        const float3 P,
-        const float3 dir,
-        const float t,
-        int nodeAddr,
-        int child,
-        float dist[2])
-{
-	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
-	float3 aligned_dir = transform_direction(&space, dir);
-	float3 aligned_P = transform_point(&space, P);
-	float3 nrdir = -bvh_inverse_direction(aligned_dir);
-	float3 tLowerXYZ = aligned_P * nrdir;
-	float3 tUpperXYZ = tLowerXYZ - nrdir;
-	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
-	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
-	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
-	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
-	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
-	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
-	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
-	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
-	*dist = tNear;
-	return tNear <= tFar;
-}
-
-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
-        KernelGlobals *kg,
-        const float3 P,
-        const float3 dir,
-        const float t,
-        const float difl,
-        int nodeAddr,
-        int child,
-        float dist[2])
-{
-	Transform space  = bvh_unaligned_node_fetch_space(kg, nodeAddr, child);
-	float3 aligned_dir = transform_direction(&space, dir);
-	float3 aligned_P = transform_point(&space, P);
-	float3 nrdir = -bvh_inverse_direction(aligned_dir);
-	float3 tLowerXYZ = aligned_P * nrdir;
-	float3 tUpperXYZ = tLowerXYZ - nrdir;
-	const float tNearX = min(tLowerXYZ.x, tUpperXYZ.x);
-	const float tNearY = min(tLowerXYZ.y, tUpperXYZ.y);
-	const float tNearZ = min(tLowerXYZ.z, tUpperXYZ.z);
-	const float tFarX  = max(tLowerXYZ.x, tUpperXYZ.x);
-	const float tFarY  = max(tLowerXYZ.y, tUpperXYZ.y);
-	const float tFarZ  = max(tLowerXYZ.z, tUpperXYZ.z);
-	const float tNear  = max4(0.0f, tNearX, tNearY, tNearZ);
-	const float tFar   = min4(t, tFarX, tFarY, tFarZ);
-	*dist = tNear;
-	if(difl != 0.0f) {
-		/* TODO(sergey): Same as for QBVH, needs a proper use. */
-		const float round_down = 1.0f - difl;
-		const float round_up = 1.0f + difl;
-		return round_down*tNear <= round_up*tFar;
-	}
-	else {
-		return tNear <= tFar;
-	}
-}
-
-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int nodeAddr,
-                                                   const uint visibility,
-                                                   float dist[2])
-{
-	int mask = 0;
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 0, &dist[0])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.x) & visibility))
-#endif
-		{
-			mask |= 1;
-		}
-	}
-	if(bvh_unaligned_node_intersect_child(kg, P, dir, t, nodeAddr, 1, &dist[1])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.y) & visibility))
-#endif
-		{
-			mask |= 2;
-		}
-	}
-	return mask;
-}
-
-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int nodeAddr,
-                                                          const uint visibility,
-                                                          float dist[2])
-{
-	int mask = 0;
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 0, &dist[0])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.x) & visibility))
-#endif
-		{
-			mask |= 1;
-		}
-	}
-	if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, nodeAddr, 1, &dist[1])) {
-#ifdef __VISIBILITY_FLAG__
-		if((__float_as_uint(cnodes.y) & visibility))
-#endif
-		{
-			mask |= 2;
-		}
-	}
-	return mask;
-}
-
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int nodeAddr,
-                                         const uint visibility,
-                                         float dist[2])
-{
-	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect(kg,
-		                                    P,
-		                                    dir,
-		                                    idir,
-		                                    t,
-		                                    nodeAddr,
-		                                    visibility,
-		                                    dist);
-	}
-	else {
-		return bvh_aligned_node_intersect(kg,
-		                                  P,
-		                                  idir,
-		                                  t,
-		                                  nodeAddr,
-		                                  visibility,
-		                                  dist);
-	}
-}
-
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int nodeAddr,
-                                                const uint visibility,
-                                                float dist[2])
-{
-	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect_robust(kg,
-		                                           P,
-		                                           dir,
-		                                           idir,
-		                                           t,
-		                                           difl,
-		                                           extmax,
-		                                           nodeAddr,
-		                                           visibility,
-		                                           dist);
-	}
-	else {
-		return bvh_aligned_node_intersect_robust(kg,
-		                                         P,
-		                                         idir,
-		                                         t,
-		                                         difl,
-		                                         extmax,
-		                                         nodeAddr,
-		                                         visibility,
-		                                         dist);
-	}
-}
-#else  /* !defined(__KERNEL_SSE2__) */
-
-int ccl_device_inline bvh_aligned_node_intersect(
-        KernelGlobals *kg,
-        const float3& P,
-        const float3& dir,
-        const ssef& tsplat,
-        const ssef Psplat[3],
-        const ssef idirsplat[3],
-        const shuffle_swap_t shufflexyz[3],
-        const int nodeAddr,
-        const uint visibility,
-        float dist[2])
-{
-	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
-	/* fetch node data */
-	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-
-	/* intersect ray against child nodes */
-	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-	/* calculate { c0min, c1min, -c0max, -c1max} */
-	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-	const ssef tminmax = minmax ^ pn;
-	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-	dist[0] = tminmax[0];
-	dist[1] = tminmax[1];
-
-	int mask = movemask(lrhit);
-
-#  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
-#  else
-	return mask & 3;
-#  endif
-}
-
-int ccl_device_inline bvh_aligned_node_intersect_robust(
-        KernelGlobals *kg,
-        const float3& P,
-        const float3& dir,
-        const ssef& tsplat,
-        const ssef Psplat[3],
-        const ssef idirsplat[3],
-        const shuffle_swap_t shufflexyz[3],
-        const float difl,
-        const float extmax,
-        const int nodeAddr,
-        const uint visibility,
-        float dist[2])
-{
-	/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
-	/* fetch node data */
-	const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
-
-	/* intersect ray against child nodes */
-	const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
-	const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
-	const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
-	/* calculate { c0min, c1min, -c0max, -c1max} */
-	ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
-	const ssef tminmax = minmax ^ pn;
-
-	if(difl != 0.0f) {
-		float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-		float4 *tminmaxview = (float4*)&tminmax;
-		float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
-		float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
-		float hdiff = 1.0f + difl;
-		float ldiff = 1.0f - difl;
-		if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
-			c0min = max(ldiff * c0min, c0min - extmax);
-			c0max = min(hdiff * c0max, c0max + extmax);
-		}
-		if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
-			c1min = max(ldiff * c1min, c1min - extmax);
-			c1max = min(hdiff * c1max, c1max + extmax);
-		}
-	}
-
-	const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
-	dist[0] = tminmax[0];
-	dist[1] = tminmax[1];
-
-	int mask = movemask(lrhit);
-
-#  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
-#  else
-	return mask & 3;
-#  endif
-}
-
-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& tnear,
-                                                   const ssef& tfar,
-                                                   const int nodeAddr,
-                                                   const uint visibility,
-                                                   float dist[2])
-{
-	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
-	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
-
-	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
-	float3 aligned_P0 = transform_point(&space0, P),
-	       aligned_P1 = transform_point(&space1, P);
-	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
-	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
-	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
-	                    aligned_P1.x * nrdir1.x,
-	                    0.0f, 0.0f),
-	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
-	                    aligned_P1.y * nrdir1.y,
-	                    0.0f,
-	                    0.0f),
-	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
-	                    aligned_P1.z * nrdir1.z,
-	                    0.0f,
-	                    0.0f);
-
-	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
-	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
-	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
-	ssef tnear_x = min(tLowerX, tUpperX);
-	ssef tnear_y = min(tLowerY, tUpperY);
-	ssef tnear_z = min(tLowerZ, tUpperZ);
-	ssef tfar_x = max(tLowerX, tUpperX);
-	ssef tfar_y = max(tLowerY, tUpperY);
-	ssef tfar_z = max(tLowerZ, tUpperZ);
-
-	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
-	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
-	sseb vmask = tNear <= tFar;
-	dist[0] = tNear.f[0];
-	dist[1] = tNear.f[1];
-
-	int mask = (int)movemask(vmask);
-
-#  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
-#  else
-	return mask & 3;
-#  endif
-}
-
-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& tnear,
-                                                          const ssef& tfar,
-                                                          const float difl,
-                                                          const int nodeAddr,
-                                                          const uint visibility,
-                                                          float dist[2])
-{
-	Transform space0 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 0);
-	Transform space1 = bvh_unaligned_node_fetch_space(kg, nodeAddr, 1);
-
-	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
-	float3 aligned_P0 = transform_point(&space0, P),
-	       aligned_P1 = transform_point(&space1, P);
-	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
-	       nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
-	ssef tLowerX = ssef(aligned_P0.x * nrdir0.x,
-	                    aligned_P1.x * nrdir1.x,
-	                    0.0f, 0.0f),
-	     tLowerY = ssef(aligned_P0.y * nrdir0.y,
-	                    aligned_P1.y * nrdir1.y,
-	                    0.0f,
-	                    0.0f),
-	     tLowerZ = ssef(aligned_P0.z * nrdir0.z,
-	                    aligned_P1.z * nrdir1.z,
-	                    0.0f,
-	                    0.0f);
-
-	ssef tUpperX = tLowerX - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
-	     tUpperY = tLowerY - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
-	     tUpperZ = tLowerZ - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
-	ssef tnear_x = min(tLowerX, tUpperX);
-	ssef tnear_y = min(tLowerY, tUpperY);
-	ssef tnear_z = min(tLowerZ, tUpperZ);
-	ssef tfar_x = max(tLowerX, tUpperX);
-	ssef tfar_y = max(tLowerY, tUpperY);
-	ssef tfar_z = max(tLowerZ, tUpperZ);
-
-	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
-	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
-	sseb vmask;
-	if(difl != 0.0f) {
-		const float round_down = 1.0f - difl;
-		const float round_up = 1.0f + difl;
-		vmask = round_down*tNear <= round_up*tFar;
-	}
-	else {
-		vmask = tNear <= tFar;
-	}
-
-	dist[0] = tNear.f[0];
-	dist[1] = tNear.f[1];
-
-	int mask = (int)movemask(vmask);
-
-#  ifdef __VISIBILITY_FLAG__
-	/* this visibility test gives a 5% performance hit, how to solve? */
-	float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-	int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
-	            (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
-	return cmask;
-#  else
-	return mask & 3;
-#  endif
-}
-
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& tnear,
-                                         const ssef& tfar,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int nodeAddr,
-                                         const uint visibility,
-                                         float dist[2])
-{
-	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect(kg,
-		                                    P,
-		                                    dir,
-		                                    tnear,
-		                                    tfar,
-		                                    nodeAddr,
-		                                    visibility,
-		                                    dist);
-	}
-	else {
-		return bvh_aligned_node_intersect(kg,
-		                                  P,
-		                                  dir,
-		                                  tsplat,
-		                                  Psplat,
-		                                  idirsplat,
-		                                  shufflexyz,
-		                                  nodeAddr,
-		                                  visibility,
-		                                  dist);
-	}
-}
-
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& tnear,
-                                                const ssef& tfar,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int nodeAddr,
-                                                const uint visibility,
-                                                float dist[2])
-{
-	float4 node = kernel_tex_fetch(__bvh_nodes, nodeAddr);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return bvh_unaligned_node_intersect_robust(kg,
-		                                           P,
-		                                           dir,
-		                                           tnear,
-		                                           tfar,
-		                                           difl,
-		                                           nodeAddr,
-		                                           visibility,
-		                                           dist);
-	}
-	else {
-		return bvh_aligned_node_intersect_robust(kg,
-		                                         P,
-		                                         dir,
-		                                         tsplat,
-		                                         Psplat,
-		                                         idirsplat,
-		                                         shufflexyz,
-		                                         difl,
-		                                         extmax,
-		                                         nodeAddr,
-		                                         visibility,
-		                                         dist);
-	}
-}
-#endif  /* !defined(__KERNEL_SSE2__) */
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
deleted file mode 100644
index a54c6024152..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2013, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __QBVH__
-#  include "geom_qbvh_shadow.h"
-#endif
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT bvh_node_intersect
-#else
-#  define NODE_INTERSECT bvh_aligned_node_intersect
-#endif
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- *
- */
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            uint *num_hits)
-{
-	/* todo:
-	 * - likely and unlikely for if() statements
-	 * - test restrict attribute for pointers
-	 */
-
-	/* traversal stack in CUDA thread-local memory */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-#if BVH_FEATURE(BVH_INSTANCING)
-	int num_hits_in_instance = 0;
-#endif
-
-	*num_hits = 0;
-	isect_array->t = tmax;
-
-#if defined(__KERNEL_SSE2__)
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-	ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-	ssef tnear(0.0f), tfar(isect_t);
-#  endif
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = ssef(P.x);
-	Psplat[1] = ssef(P.y);
-	Psplat[2] = ssef(P.z);
-
-	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif  /* __KERNEL_SSE2__ */
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* traversal loop */
-	do {
-		do {
-			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				int nodeAddrChild1, traverse_mask;
-				float dist[2];
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#if !defined(__KERNEL_SSE2__)
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               dir,
-#  endif
-				                               idir,
-				                               isect_t,
-				                               nodeAddr,
-				                               PATH_RAY_SHADOW,
-				                               dist);
-#else // __KERNEL_SSE2__
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-				                               dir,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               tnear,
-				                               tfar,
-#  endif
-				                               tsplat,
-				                               Psplat,
-				                               idirsplat,
-				                               shufflexyz,
-				                               nodeAddr,
-				                               PATH_RAY_SHADOW,
-				                               dist);
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.z);
-				nodeAddrChild1 = __float_as_int(cnodes.w);
-
-				if(traverse_mask == 3) {
-					/* Both children were intersected, push the farther one. */
-					bool closestChild1 = (dist[1] < dist[0]);
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* One child was intersected. */
-					if(traverse_mask == 2) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(traverse_mask == 0) {
-						/* Neither child was intersected. */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf, fetch triangle list */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					const int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-					const uint p_type = type & PRIMITIVE_ALL;
-
-					/* pop */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection */
-					while(primAddr < primAddr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-
-						bool hit;
-
-						/* todo: specialized intersect functions which don't fill in
-						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
-						 * might give a few % performance improvement */
-
-						switch(p_type) {
-							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
-								break;
-							}
-#if BVH_FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
-								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
-								break;
-							}
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-							case PRIMITIVE_CURVE:
-							case PRIMITIVE_MOTION_CURVE: {
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
-									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
-								else
-									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
-								break;
-							}
-#endif
-							default: {
-								hit = false;
-								break;
-							}
-						}
-
-						/* shadow ray early termination */
-						if(hit) {
-							/* detect if this surface has a shader with transparent shadows */
-
-							/* todo: optimize so primitive visibility flag indicates if
-							 * the primitive has a transparent shadow shader? */
-							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-							int shader = 0;
-
-#ifdef __HAIR__
-							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-							{
-								shader = kernel_tex_fetch(__tri_shader, prim);
-							}
-#ifdef __HAIR__
-							else {
-								float4 str = kernel_tex_fetch(__curves, prim);
-								shader = __float_as_int(str.z);
-							}
-#endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
-
-							/* if no transparent shadows, all light is blocked */
-							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-								return true;
-							}
-							/* if maximum number of hits reached, block all light */
-							else if(*num_hits == max_hits) {
-								return true;
-							}
-
-							/* move on to next entry in intersections array */
-							isect_array++;
-							(*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
-							num_hits_in_instance++;
-#endif
-
-							isect_array->t = isect_t;
-						}
-
-						primAddr++;
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
-#  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
-#  endif
-
-					triangle_intersect_precalc(dir, &isect_precalc);
-					num_hits_in_instance = 0;
-					isect_array->t = isect_t;
-
-#  if defined(__KERNEL_SSE2__)
-					Psplat[0] = ssef(P.x);
-					Psplat[1] = ssef(P.y);
-					Psplat[2] = ssef(P.z);
-
-					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-					tfar = ssef(isect_t);
-#    endif
-					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-					nodeAddr = kernel_tex_fetch(__object_node, object);
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			if(num_hits_in_instance) {
-				float t_fac;
-
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-
-				triangle_intersect_precalc(dir, &isect_precalc);
-
-				/* scale isect->t to adjust for instancing */
-				for(int i = 0; i < num_hits_in_instance; i++)
-					(isect_array-i-1)->t *= t_fac;
-			}
-			else {
-				float ignore_t = FLT_MAX;
-
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
-#  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
-#  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
-			}
-
-			isect_t = tmax;
-			isect_array->t = isect_t;
-
-#  if defined(__KERNEL_SSE2__)
-			Psplat[0] = ssef(P.x);
-			Psplat[1] = ssef(P.y);
-			Psplat[2] = ssef(P.z);
-
-			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-			tfar = ssef(isect_t);
-#    endif
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return false;
-}
-
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
-                                         const Ray *ray,
-                                         Intersection *isect_array,
-                                         const uint max_hits,
-                                         uint *num_hits)
-{
-#ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect_array,
-		                                    max_hits,
-		                                    num_hits);
-	}
-	else
-#endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect_array,
-		                                   max_hits,
-		                                   num_hits);
-	}
-}
-
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
deleted file mode 100644
index 88aaf01d682..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2013, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __QBVH__
-#  include "geom_qbvh_subsurface.h"
-#endif
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT bvh_node_intersect
-#else
-#  define NODE_INTERSECT bvh_aligned_node_intersect
-#endif
-
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_MOTION: motion blur rendering
- *
- */
-
-ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            SubsurfaceIntersection *ss_isect,
-                                            int subsurface_object,
-                                            uint *lcg_state,
-                                            int max_hits)
-{
-	/* todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - test restrict attribute for pointers
-	 */
-
-	/* traversal stack in CUDA thread-local memory */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object);
-
-	/* ray parameters in registers */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = ray->t;
-
-	ss_isect->num_hits = 0;
-
-	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
-		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
-#else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
-#endif
-		object = subsurface_object;
-	}
-
-#if defined(__KERNEL_SSE2__)
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-	ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-	ssef tnear(0.0f), tfar(isect_t);
-#  endif
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = ssef(P.x);
-	Psplat[1] = ssef(P.y);
-	Psplat[2] = ssef(P.z);
-
-	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* traversal loop */
-	do {
-		do {
-			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				int nodeAddrChild1, traverse_mask;
-				float dist[2];
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#if !defined(__KERNEL_SSE2__)
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               dir,
-#  endif
-				                               idir,
-				                               isect_t,
-				                               nodeAddr,
-				                               PATH_RAY_ALL_VISIBILITY,
-				                               dist);
-#else // __KERNEL_SSE2__
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-				                               dir,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               tnear,
-				                               tfar,
-#  endif
-				                               tsplat,
-				                               Psplat,
-				                               idirsplat,
-				                               shufflexyz,
-				                               nodeAddr,
-				                               PATH_RAY_ALL_VISIBILITY,
-				                               dist);
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.z);
-				nodeAddrChild1 = __float_as_int(cnodes.w);
-
-				if(traverse_mask == 3) {
-					/* Both children were intersected, push the farther one. */
-					bool closestChild1 = (dist[1] < dist[0]);
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* One child was intersected. */
-					if(traverse_mask == 2) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(traverse_mask == 0) {
-						/* Neither child was intersected. */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf, fetch triangle list */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-				const int primAddr2 = __float_as_int(leaf.y);
-				const uint type = __float_as_int(leaf.w);
-
-				/* pop */
-				nodeAddr = traversalStack[stackPtr];
-				--stackPtr;
-
-				/* primitive intersection */
-				switch(type & PRIMITIVE_ALL) {
-					case PRIMITIVE_TRIANGLE: {
-						/* intersect ray against primitive */
-						for(; primAddr < primAddr2; primAddr++) {
-							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
-							                              ss_isect,
-							                              P,
-							                              object,
-							                              primAddr,
-							                              isect_t,
-							                              lcg_state,
-							                              max_hits);
-						}
-						break;
-					}
-#if BVH_FEATURE(BVH_MOTION)
-					case PRIMITIVE_MOTION_TRIANGLE: {
-						/* intersect ray against primitive */
-						for(; primAddr < primAddr2; primAddr++) {
-							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-							motion_triangle_intersect_subsurface(kg,
-							                                     ss_isect,
-							                                     P,
-							                                     dir,
-							                                     ray->time,
-							                                     object,
-							                                     primAddr,
-							                                     isect_t,
-							                                     lcg_state,
-							                                     max_hits);
-						}
-						break;
-					}
-#endif
-					default: {
-						break;
-					}
-				}
-			}
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-}
-
-ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
-                                         const Ray *ray,
-                                         SubsurfaceIntersection *ss_isect,
-                                         int subsurface_object,
-                                         uint *lcg_state,
-                                         int max_hits)
-{
-#ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    ss_isect,
-		                                    subsurface_object,
-		                                    lcg_state,
-		                                    max_hits);
-	}
-	else
-#endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   ss_isect,
-		                                   subsurface_object,
-		                                   lcg_state,
-		                                   max_hits);
-	}
-}
-
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
deleted file mode 100644
index f409dd5f403..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2013, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __QBVH__
-#  include "geom_qbvh_traversal.h"
-#endif
-
-#if BVH_FEATURE(BVH_HAIR)  /* choose the node test routines used by the traversal below */
-#  define NODE_INTERSECT bvh_node_intersect
-#  define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
-#else  /* !BVH_FEATURE(BVH_HAIR) */
-#  define NODE_INTERSECT bvh_aligned_node_intersect
-#  define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
-#endif  /* BVH_FEATURE(BVH_HAIR) */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
- * BVH_MOTION: motion blur rendering
- *
- */
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                            , uint *lcg_state,
-                                            float difl,
-                                            float extmax
-#endif
-                                            )
-{
-	/* Stack based traversal that starts at kernel_data.bvh.root and reports
-	 * the hit through isect (isect->t starts out as ray->t). Returns true
-	 * when isect->prim is no longer PRIM_NONE at the end, or immediately
-	 * after any primitive hit when visibility == PATH_RAY_SHADOW_OPAQUE.
-	 *
-	 * todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - test restrict attribute for pointers
-	 */
-
-	/* traversal stack in CUDA thread-local memory; slot 0 holds the sentinel
-	 * whose pop terminates the outer loop */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers; these are passed by pointer to the
-	 * instance push/pop helpers further down */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-	isect->t = ray->t;
-	isect->u = 0.0f;
-	isect->v = 0.0f;
-	isect->prim = PRIM_NONE;
-	isect->object = OBJECT_NONE;
-
-	BVH_DEBUG_INIT();
-
-#if defined(__KERNEL_SSE2__)  /* values handed to NODE_INTERSECT in the SSE2 branch below */
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-	ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-	ssef tnear(0.0f), tfar(isect->t);
-#  endif
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = ssef(P.x);
-	Psplat[1] = ssef(P.y);
-	Psplat[2] = ssef(P.z);
-
-	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* traversal loop: the inner do/while runs until a sentinel is popped, the
-	 * outer one additionally handles leaving an instance */
-	do {
-		do {
-			/* traverse internal nodes: loop while nodeAddr is non-negative
-			 * and not the sentinel */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				int nodeAddrChild1, traverse_mask;
-				float dist[2];
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#if !defined(__KERNEL_SSE2__)
-#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-				if(difl != 0.0f) {
-					traverse_mask = NODE_INTERSECT_ROBUST(kg,
-					                                      P,
-#    if BVH_FEATURE(BVH_HAIR)
-					                                      dir,
-#    endif
-					                                      idir,
-					                                      isect->t,
-					                                      difl,
-					                                      extmax,
-					                                      nodeAddr,
-					                                      visibility,
-					                                      dist);
-				}
-				else
-#  endif
-				{
-					traverse_mask = NODE_INTERSECT(kg,
-					                               P,
-#    if BVH_FEATURE(BVH_HAIR)
-					                               dir,
-#    endif
-					                               idir,
-					                               isect->t,
-					                               nodeAddr,
-					                               visibility,
-					                               dist);
-				}
-#else // __KERNEL_SSE2__
-#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-				if(difl != 0.0f) {
-					traverse_mask = NODE_INTERSECT_ROBUST(kg,
-					                                      P,
-					                                      dir,
-#    if BVH_FEATURE(BVH_HAIR)
-					                                      tnear,
-					                                      tfar,
-#    endif
-					                                      tsplat,
-					                                      Psplat,
-					                                      idirsplat,
-					                                      shufflexyz,
-					                                      difl,
-					                                      extmax,
-					                                      nodeAddr,
-					                                      visibility,
-					                                      dist);
-				}
-				else
-#  endif
-				{
-					traverse_mask = NODE_INTERSECT(kg,
-					                               P,
-					                               dir,
-#    if BVH_FEATURE(BVH_HAIR)
-					                               tnear,
-					                               tfar,
-#    endif
-					                               tsplat,
-					                               Psplat,
-					                               idirsplat,
-					                               shufflexyz,
-					                               nodeAddr,
-					                               visibility,
-					                               dist);
-				}
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.z);
-				nodeAddrChild1 = __float_as_int(cnodes.w);
-
-				if(traverse_mask == 3) {
-					/* Both children were intersected, push the farther one
-					 * and continue with the one that has the smaller dist. */
-					bool closestChild1 = (dist[1] < dist[0]);
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* At most one child was intersected. For any mask other
-					 * than 2 and 0, nodeAddr keeps the first child that was
-					 * read from cnodes.z above. */
-					if(traverse_mask == 2) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(traverse_mask == 0) {
-						/* Neither child was intersected, pop the next node. */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-				BVH_DEBUG_NEXT_STEP();
-			}
-
-			/* if node is leaf (negative address), fetch triangle list from
-			 * __bvh_leaf_nodes at index (-nodeAddr-1) */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					const int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-
-					/* pop the node to visit next before testing this leaf */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection over [primAddr, primAddr2),
-					 * dispatched on the primitive type stored in leaf.w */
-					switch(type & PRIMITIVE_ALL) {
-						case PRIMITIVE_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
-									/* shadow ray early termination; otherwise the
-									 * SSE2 distance values are refreshed from isect->t */
-#if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-									tfar = ssef(isect->t);
-#  endif
-#else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-#endif
-								}
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
-									/* shadow ray early termination; otherwise the
-									 * SSE2 distance values are refreshed from isect->t */
-#  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-									tfar = ssef(isect->t);
-#    endif
-#  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-#  endif
-								}
-							}
-							break;
-						}
-#endif  /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
-						case PRIMITIVE_CURVE:
-						case PRIMITIVE_MOTION_CURVE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								bool hit;
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
-									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								else
-									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								if(hit) {
-									/* shadow ray early termination; otherwise the
-									 * SSE2 distance values are refreshed from isect->t */
-#  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-									tfar = ssef(isect->t);
-#    endif
-#  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-#  endif
-								}
-							}
-							break;
-						}
-#endif  /* BVH_FEATURE(BVH_HAIR) */
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push: a negative primAddr refers to an
-					 * instance whose object index is stored in __prim_object
-					 * at (-primAddr-1). The push helper receives pointers to
-					 * P, dir, idir and isect->t, so everything derived from
-					 * them is recomputed afterwards. */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-					triangle_intersect_precalc(dir, &isect_precalc);
-
-#  if defined(__KERNEL_SSE2__)
-					Psplat[0] = ssef(P.x);
-					Psplat[1] = ssef(P.y);
-					Psplat[2] = ssef(P.z);
-
-					tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-					tfar = ssef(isect->t);
-#    endif
-
-					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-					nodeAddr = kernel_tex_fetch(__object_node, object);
-
-					BVH_DEBUG_NEXT_INSTANCE();
-				}
-			}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			/* instance pop: a sentinel was popped while entries remain on the
-			 * stack, so it is the one pushed on instance entry. The pop
-			 * helper gets the same pointers as the push helper and the
-			 * derived values are rebuilt again. */
-#  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-			triangle_intersect_precalc(dir, &isect_precalc);
-
-#  if defined(__KERNEL_SSE2__)
-			Psplat[0] = ssef(P.x);
-			Psplat[1] = ssef(P.y);
-			Psplat[2] = ssef(P.z);
-
-			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-			tfar = ssef(isect->t);
-#    endif
-
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return (isect->prim != PRIM_NONE);
-}
-
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
-                                         const Ray *ray,
-                                         Intersection *isect,
-                                         const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                         , uint *lcg_state,
-                                         float difl,
-                                         float extmax
-#endif
-                                         )
-{
-	/* Dispatcher for this traversal variant: forwards the arguments unchanged
-	 * to the QBVH or the BVH implementation, selected by
-	 * kernel_data.bvh.use_qbvh. The minimum width arguments only exist when
-	 * BVH_HAIR_MINIMUM_WIDTH is part of the feature set. */
-#ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#  else
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
-#  endif
-	}
-#endif
-	/* Reached when QBVH is compiled out or not selected. */
-	kernel_assert(kernel_data.bvh.use_qbvh == false);
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#else
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
-#endif
-}
-
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-#undef NODE_INTERSECT
-#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
deleted file mode 100644
index 5e70ce99f51..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_volume.h
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __QBVH__
-#  include "geom_qbvh_volume.h"
-#endif
-
-#if BVH_FEATURE(BVH_HAIR)  /* choose the node test routine used by the traversal below */
-#  define NODE_INTERSECT bvh_node_intersect
-#else  /* !BVH_FEATURE(BVH_HAIR) */
-#  define NODE_INTERSECT bvh_aligned_node_intersect
-#endif  /* BVH_FEATURE(BVH_HAIR) */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- *
- */
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility)
-{
-	/* Stack based traversal that starts at kernel_data.bvh.root and only
-	 * tests triangles of objects flagged SD_OBJECT_HAS_VOLUME. The hit is
-	 * reported through isect (isect->t starts out as ray->t); returns true
-	 * when isect->prim is no longer PRIM_NONE at the end.
-	 *
-	 * todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - test restrict attribute for pointers
-	 */
-
-	/* traversal stack in CUDA thread-local memory; slot 0 holds the sentinel
-	 * whose pop terminates the outer loop */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers; these are passed by pointer to the
-	 * instance push/pop helpers further down */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-	isect->t = ray->t;
-	isect->u = 0.0f;
-	isect->v = 0.0f;
-	isect->prim = PRIM_NONE;
-	isect->object = OBJECT_NONE;
-
-#if defined(__KERNEL_SSE2__)  /* values handed to NODE_INTERSECT in the SSE2 branch below */
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-	ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-	ssef tnear(0.0f), tfar(isect->t);
-#  endif
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = ssef(P.x);
-	Psplat[1] = ssef(P.y);
-	Psplat[2] = ssef(P.z);
-
-	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* traversal loop: the inner do/while runs until a sentinel is popped, the
-	 * outer one additionally handles leaving an instance */
-	do {
-		do {
-			/* traverse internal nodes: loop while nodeAddr is non-negative
-			 * and not the sentinel */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				int nodeAddrChild1, traverse_mask;
-				float dist[2];
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#if !defined(__KERNEL_SSE2__)
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               dir,
-#  endif
-				                               idir,
-				                               isect->t,
-				                               nodeAddr,
-				                               visibility,
-				                               dist);
-#else // __KERNEL_SSE2__
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-				                               dir,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               tnear,
-				                               tfar,
-#  endif
-				                               tsplat,
-				                               Psplat,
-				                               idirsplat,
-				                               shufflexyz,
-				                               nodeAddr,
-				                               visibility,
-				                               dist);
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.z);
-				nodeAddrChild1 = __float_as_int(cnodes.w);
-
-				if(traverse_mask == 3) {
-					/* Both children were intersected, push the farther one
-					 * and continue with the one that has the smaller dist. */
-					bool closestChild1 = (dist[1] < dist[0]);
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* At most one child was intersected. For any mask other
-					 * than 2 and 0, nodeAddr keeps the first child that was
-					 * read from cnodes.z above. */
-					if(traverse_mask == 2) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(traverse_mask == 0) {
-						/* Neither child was intersected, pop the next node. */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf (negative address), fetch triangle list from
-			 * __bvh_leaf_nodes at index (-nodeAddr-1) */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					const int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-
-					/* pop the node to visit next before testing this leaf */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection over [primAddr, primAddr2),
-					 * dispatched on the primitive type stored in leaf.w;
-					 * types without a case below are ignored */
-					switch(type & PRIMITIVE_ALL) {
-						case PRIMITIVE_TRIANGLE: {
-							/* intersect ray against primitive */
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* only primitives from volume object: outside an
-								 * instance the owner is looked up in __prim_object,
-								 * inside one it is the instanced object */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							/* intersect ray against primitive */
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* only primitives from volume object: outside an
-								 * instance the owner is looked up in __prim_object,
-								 * inside one it is the instanced object */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
-							}
-							break;
-						}
-#endif
-						default: {
-							break;
-						}
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push: a negative primAddr refers to an
-					 * instance whose object index is stored in __prim_object
-					 * at (-primAddr-1); it is only entered when the object
-					 * has SD_OBJECT_HAS_VOLUME set */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-					int object_flag = kernel_tex_fetch(__object_flag, object);
-
-					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
-#  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-
-						triangle_intersect_precalc(dir, &isect_precalc);
-
-#  if defined(__KERNEL_SSE2__)
-						Psplat[0] = ssef(P.x);
-						Psplat[1] = ssef(P.y);
-						Psplat[2] = ssef(P.z);
-
-						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-						tfar = ssef(isect->t);
-#    endif
-
-						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_STACK_SIZE);
-						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-						nodeAddr = kernel_tex_fetch(__object_node, object);
-					}
-					else {
-						/* instance has no volume flag: skip it and pop the next node */
-						object = OBJECT_NONE;
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			/* instance pop: a sentinel was popped while entries remain on the
-			 * stack, so it is the one pushed on instance entry. The pop
-			 * helper gets the same pointers as the push helper and the
-			 * derived values are rebuilt again. */
-#  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-
-			triangle_intersect_precalc(dir, &isect_precalc);
-
-#  if defined(__KERNEL_SSE2__)
-			Psplat[0] = ssef(P.x);
-			Psplat[1] = ssef(P.y);
-			Psplat[2] = ssef(P.z);
-
-			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-#    if BVH_FEATURE(BVH_HAIR)
-			tfar = ssef(isect->t);
-#    endif
-
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return (isect->prim != PRIM_NONE);
-}
-
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
-                                         const Ray *ray,
-                                         Intersection *isect,
-                                         const uint visibility)
-{
-	/* Dispatcher for this traversal variant: forwards the arguments unchanged
-	 * to the QBVH or the BVH implementation, selected by
-	 * kernel_data.bvh.use_qbvh. */
-#ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
-	}
-#endif
-	/* Reached when QBVH is compiled out or not selected. */
-	kernel_assert(kernel_data.bvh.use_qbvh == false);
-	return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
-}
-
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
deleted file mode 100644
index ab5ac8505a3..00000000000
--- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __QBVH__
-#  include "geom_qbvh_volume_all.h"
-#endif
-
-#if BVH_FEATURE(BVH_HAIR)  /* choose the node test routine used by the traversal below */
-#  define NODE_INTERSECT bvh_node_intersect
-#else  /* !BVH_FEATURE(BVH_HAIR) */
-#  define NODE_INTERSECT bvh_aligned_node_intersect
-#endif  /* BVH_FEATURE(BVH_HAIR) */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- *
- */
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            const uint visibility)
-{
-	/* todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - test restrict attribute for pointers
-	 */
-
-	/* traversal stack in CUDA thread-local memory */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-#if BVH_FEATURE(BVH_INSTANCING)
-	int num_hits_in_instance = 0;
-#endif
-
-	uint num_hits = 0;
-	isect_array->t = tmax;
-
-#if defined(__KERNEL_SSE2__)
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
-	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-	ssef Psplat[3], idirsplat[3];
-#  if BVH_FEATURE(BVH_HAIR)
-	ssef tnear(0.0f), tfar(isect_t);
-#  endif
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = ssef(P.x);
-	Psplat[1] = ssef(P.y);
-	Psplat[2] = ssef(P.z);
-
-	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* traversal loop */
-	do {
-		do {
-			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				int nodeAddrChild1, traverse_mask;
-				float dist[2];
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#if !defined(__KERNEL_SSE2__)
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               dir,
-#  endif
-				                               idir,
-				                               isect_t,
-				                               nodeAddr,
-				                               visibility,
-				                               dist);
-#else // __KERNEL_SSE2__
-				traverse_mask = NODE_INTERSECT(kg,
-				                               P,
-				                               dir,
-#  if BVH_FEATURE(BVH_HAIR)
-				                               tnear,
-				                               tfar,
-#  endif
-				                               tsplat,
-				                               Psplat,
-				                               idirsplat,
-				                               shufflexyz,
-				                               nodeAddr,
-				                               visibility,
-				                               dist);
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.z);
-				nodeAddrChild1 = __float_as_int(cnodes.w);
-
-				if(traverse_mask == 3) {
-					/* Both children were intersected, push the farther one. */
-					bool closestChild1 = (dist[1] < dist[0]);
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_STACK_SIZE);
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* One child was intersected. */
-					if(traverse_mask == 2) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(traverse_mask == 0) {
-						/* Neither child was intersected. */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf, fetch triangle list */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					const int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-					bool hit;
-
-					/* pop */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection */
-					switch(type & PRIMITIVE_ALL) {
-						case PRIMITIVE_TRIANGLE: {
-							/* intersect ray against primitive */
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* only primitives from volume object */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
-								if(hit) {
-									/* Move on to next entry in intersections array. */
-									isect_array++;
-									num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
-									num_hits_in_instance++;
-#endif
-									isect_array->t = isect_t;
-									if(num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
-#  if BVH_FEATURE(BVH_MOTION)
-										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-										Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-										float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-										for(int i = 0; i < num_hits_in_instance; i++) {
-											(isect_array-i-1)->t *= t_fac;
-										}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-										return num_hits;
-									}
-								}
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							/* intersect ray against primitive */
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* only primitives from volume object */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
-								if(hit) {
-									/* Move on to next entry in intersections array. */
-									isect_array++;
-									num_hits++;
-#  if BVH_FEATURE(BVH_INSTANCING)
-									num_hits_in_instance++;
-#  endif
-									isect_array->t = isect_t;
-									if(num_hits == max_hits) {
-#  if BVH_FEATURE(BVH_INSTANCING)
-#    if BVH_FEATURE(BVH_MOTION)
-										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#    else
-										Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-										float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#    endif
-										for(int i = 0; i < num_hits_in_instance; i++) {
-											(isect_array-i-1)->t *= t_fac;
-										}
-#  endif  /* BVH_FEATURE(BVH_INSTANCING) */
-										return num_hits;
-									}
-								}
-							}
-							break;
-						}
-#endif  /* BVH_MOTION */
-						default: {
-							break;
-						}
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-					int object_flag = kernel_tex_fetch(__object_flag, object);
-
-					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
-#  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
-#  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
-#  endif
-
-						triangle_intersect_precalc(dir, &isect_precalc);
-						num_hits_in_instance = 0;
-						isect_array->t = isect_t;
-
-#  if defined(__KERNEL_SSE2__)
-						Psplat[0] = ssef(P.x);
-						Psplat[1] = ssef(P.y);
-						Psplat[2] = ssef(P.z);
-
-						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-						tfar = ssef(isect_t);
-#    endif
-
-						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_STACK_SIZE);
-						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-						nodeAddr = kernel_tex_fetch(__object_node, object);
-					}
-					else {
-						/* pop */
-						object = OBJECT_NONE;
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			if(num_hits_in_instance) {
-				float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
-				/* Scale isect->t to adjust for instancing. */
-				for(int i = 0; i < num_hits_in_instance; i++) {
-					(isect_array-i-1)->t *= t_fac;
-				}
-			}
-			else {
-				float ignore_t = FLT_MAX;
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
-#  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
-#  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
-			}
-
-			isect_t = tmax;
-			isect_array->t = isect_t;
-
-#  if defined(__KERNEL_SSE2__)
-			Psplat[0] = ssef(P.x);
-			Psplat[1] = ssef(P.y);
-			Psplat[2] = ssef(P.z);
-
-			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-#    if BVH_FEATURE(BVH_HAIR)
-			tfar = ssef(isect_t);
-#    endif
-
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#  endif
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_MOTION) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return num_hits;
-}
-
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
-                                         const Ray *ray,
-                                         Intersection *isect_array,
-                                         const uint max_hits,
-                                         const uint visibility)
-{
-#ifdef __QBVH__
-	if(kernel_data.bvh.use_qbvh) {
-		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
-		                                    ray,
-		                                    isect_array,
-		                                    max_hits,
-		                                    visibility);
-	}
-	else
-#endif
-	{
-		kernel_assert(kernel_data.bvh.use_qbvh == false);
-		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
-		                                   ray,
-		                                   isect_array,
-		                                   max_hits,
-		                                   visibility);
-	}
-}
-
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
deleted file mode 100644
index 5eda3213acb..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh.h
+++ /dev/null
@@ -1,433 +0,0 @@
-/*
- * Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-struct QBVHStackItem {
-	int addr;
-	float dist;
-};
-
-/* TOOD(sergey): Investigate if using intrinsics helps for both
- * stack item swap and float comparison.
- */
-ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
-                                      QBVHStackItem *__restrict b)
-{
-	QBVHStackItem tmp = *a;
-	*a = *b;
-	*b = tmp;
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
-                                       QBVHStackItem *__restrict s2,
-                                       QBVHStackItem *__restrict s3)
-{
-	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
-	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
-	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
-                                       QBVHStackItem *__restrict s2,
-                                       QBVHStackItem *__restrict s3,
-                                       QBVHStackItem *__restrict s4)
-{
-	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
-	if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
-	if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); }
-	if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); }
-	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
-}
-
-/* Axis-aligned nodes intersection */
-
-ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
-                                                  const ssef& tnear,
-                                                  const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-                                                  const sse3f& org_idir,
-#else
-                                                  const sse3f& org,
-#endif
-                                                  const sse3f& idir,
-                                                  const int near_x,
-                                                  const int near_y,
-                                                  const int near_z,
-                                                  const int far_x,
-                                                  const int far_y,
-                                                  const int far_z,
-                                                  const int nodeAddr,
-                                                  ssef *__restrict dist)
-{
-	const int offset = nodeAddr + 1;
-#ifdef __KERNEL_AVX2__
-	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
-	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
-	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
-	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
-	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
-	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
-#else
-	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
-	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
-	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
-	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
-	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
-	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
-#endif
-
-#ifdef __KERNEL_SSE41__
-	const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear));
-	const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar));
-	const sseb vmask = cast(tNear) > cast(tFar);
-	int mask = (int)movemask(vmask)^0xf;
-#else
-	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
-	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
-	const sseb vmask = tNear <= tFar;
-	int mask = (int)movemask(vmask);
-#endif
-	*dist = tNear;
-	return mask;
-}
-
-ccl_device_inline int qbvh_aligned_node_intersect_robust(
-        KernelGlobals *__restrict kg,
-        const ssef& tnear,
-        const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-        const sse3f& P_idir,
-#else
-        const sse3f& P,
-#endif
-        const sse3f& idir,
-        const int near_x,
-        const int near_y,
-        const int near_z,
-        const int far_x,
-        const int far_y,
-        const int far_z,
-        const int nodeAddr,
-        const float difl,
-        ssef *__restrict dist)
-{
-	const int offset = nodeAddr + 1;
-#ifdef __KERNEL_AVX2__
-	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
-	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
-	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
-	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
-	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
-	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
-#else
-	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
-	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
-	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
-	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
-	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
-	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
-#endif
-
-	const float round_down = 1.0f - difl;
-	const float round_up = 1.0f + difl;
-	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
-	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
-	const sseb vmask = round_down*tNear <= round_up*tFar;
-	*dist = tNear;
-	return (int)movemask(vmask);
-}
-
-/* Unaligned nodes intersection */
-
-ccl_device_inline int qbvh_unaligned_node_intersect(
-        KernelGlobals *__restrict kg,
-        const ssef& tnear,
-        const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-        const sse3f& org_idir,
-#endif
-        const sse3f& org,
-        const sse3f& dir,
-        const sse3f& idir,
-        const int near_x,
-        const int near_y,
-        const int near_z,
-        const int far_x,
-        const int far_y,
-        const int far_z,
-        const int nodeAddr,
-        ssef *__restrict dist)
-{
-	const int offset = nodeAddr;
-	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
-	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
-	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
-
-	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
-	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
-	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
-
-	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
-	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
-	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
-
-	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
-	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
-	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
-
-	const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
-	           aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
-	           aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
-
-	const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
-	           aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
-	           aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
-
-	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
-	const ssef nrdir_x = neg_one / aligned_dir_x,
-	           nrdir_y = neg_one / aligned_dir_y,
-	           nrdir_z = neg_one / aligned_dir_z;
-
-	const ssef tlower_x = aligned_P_x * nrdir_x,
-	           tlower_y = aligned_P_y * nrdir_y,
-	           tlower_z = aligned_P_z * nrdir_z;
-
-	const ssef tupper_x = tlower_x - nrdir_x,
-	           tupper_y = tlower_y - nrdir_y,
-	           tupper_z = tlower_z - nrdir_z;
-
-#ifdef __KERNEL_SSE41__
-	const ssef tnear_x = mini(tlower_x, tupper_x);
-	const ssef tnear_y = mini(tlower_y, tupper_y);
-	const ssef tnear_z = mini(tlower_z, tupper_z);
-	const ssef tfar_x = maxi(tlower_x, tupper_x);
-	const ssef tfar_y = maxi(tlower_y, tupper_y);
-	const ssef tfar_z = maxi(tlower_z, tupper_z);
-	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
-	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
-	const sseb vmask = tNear <= tFar;
-	*dist = tNear;
-	return movemask(vmask);
-#else
-	const ssef tnear_x = min(tlower_x, tupper_x);
-	const ssef tnear_y = min(tlower_y, tupper_y);
-	const ssef tnear_z = min(tlower_z, tupper_z);
-	const ssef tfar_x = max(tlower_x, tupper_x);
-	const ssef tfar_y = max(tlower_y, tupper_y);
-	const ssef tfar_z = max(tlower_z, tupper_z);
-	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
-	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
-	const sseb vmask = tNear <= tFar;
-	*dist = tNear;
-	return movemask(vmask);
-#endif
-}
-
-ccl_device_inline int qbvh_unaligned_node_intersect_robust(
-        KernelGlobals *__restrict kg,
-        const ssef& tnear,
-        const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-        const sse3f& P_idir,
-#endif
-        const sse3f& P,
-        const sse3f& dir,
-        const sse3f& idir,
-        const int near_x,
-        const int near_y,
-        const int near_z,
-        const int far_x,
-        const int far_y,
-        const int far_z,
-        const int nodeAddr,
-        const float difl,
-        ssef *__restrict dist)
-{
-	const int offset = nodeAddr;
-	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
-	const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2);
-	const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3);
-
-	const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4);
-	const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5);
-	const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6);
-
-	const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7);
-	const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8);
-	const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9);
-
-	const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10);
-	const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11);
-	const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12);
-
-	const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
-	           aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
-	           aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
-
-	const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
-	           aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
-	           aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
-
-	const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
-	const ssef nrdir_x = neg_one / aligned_dir_x,
-	           nrdir_y = neg_one / aligned_dir_y,
-	           nrdir_z = neg_one / aligned_dir_z;
-
-	const ssef tlower_x = aligned_P_x * nrdir_x,
-	           tlower_y = aligned_P_y * nrdir_y,
-	           tlower_z = aligned_P_z * nrdir_z;
-
-	const ssef tupper_x = tlower_x - nrdir_x,
-	           tupper_y = tlower_y - nrdir_y,
-	           tupper_z = tlower_z - nrdir_z;
-
-	const float round_down = 1.0f - difl;
-	const float round_up = 1.0f + difl;
-
-#ifdef __KERNEL_SSE41__
-	const ssef tnear_x = mini(tlower_x, tupper_x);
-	const ssef tnear_y = mini(tlower_y, tupper_y);
-	const ssef tnear_z = mini(tlower_z, tupper_z);
-	const ssef tfar_x = maxi(tlower_x, tupper_x);
-	const ssef tfar_y = maxi(tlower_y, tupper_y);
-	const ssef tfar_z = maxi(tlower_z, tupper_z);
-#else
-	const ssef tnear_x = min(tlower_x, tupper_x);
-	const ssef tnear_y = min(tlower_y, tupper_y);
-	const ssef tnear_z = min(tlower_z, tupper_z);
-	const ssef tfar_x = max(tlower_x, tupper_x);
-	const ssef tfar_y = max(tlower_y, tupper_y);
-	const ssef tfar_z = max(tlower_z, tupper_z);
-#endif
-	const ssef tNear = max4(tnear, tnear_x, tnear_y, tnear_z);
-	const ssef tFar = min4(tfar, tfar_x, tfar_y, tfar_z);
-	const sseb vmask = round_down*tNear <= round_up*tFar;
-	*dist = tNear;
-	return movemask(vmask);
-}
-
-/* Intersectors wrappers.
- *
- * They'll check node type and call appropriate intersection code.
- */
-
-ccl_device_inline int qbvh_node_intersect(
-        KernelGlobals *__restrict kg,
-        const ssef& tnear,
-        const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-        const sse3f& org_idir,
-#endif
-        const sse3f& org,
-        const sse3f& dir,
-        const sse3f& idir,
-        const int near_x,
-        const int near_y,
-        const int near_z,
-        const int far_x,
-        const int far_y,
-        const int far_z,
-        const int nodeAddr,
-        ssef *__restrict dist)
-{
-	const int offset = nodeAddr;
-	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return qbvh_unaligned_node_intersect(kg,
-		                                     tnear,
-		                                     tfar,
-#ifdef __KERNEL_AVX2__
-		                                     org_idir,
-#endif
-		                                     org,
-		                                     dir,
-		                                     idir,
-		                                     near_x, near_y, near_z,
-		                                     far_x, far_y, far_z,
-		                                     nodeAddr,
-		                                     dist);
-	}
-	else {
-		return qbvh_aligned_node_intersect(kg,
-		                                   tnear,
-		                                   tfar,
-#ifdef __KERNEL_AVX2__
-		                                   org_idir,
-#else
-		                                   org,
-#endif
-		                                   idir,
-		                                   near_x, near_y, near_z,
-		                                   far_x, far_y, far_z,
-		                                   nodeAddr,
-		                                   dist);
-	}
-}
-
-ccl_device_inline int qbvh_node_intersect_robust(
-        KernelGlobals *__restrict kg,
-        const ssef& tnear,
-        const ssef& tfar,
-#ifdef __KERNEL_AVX2__
-        const sse3f& P_idir,
-#endif
-        const sse3f& P,
-        const sse3f& dir,
-        const sse3f& idir,
-        const int near_x,
-        const int near_y,
-        const int near_z,
-        const int far_x,
-        const int far_y,
-        const int far_z,
-        const int nodeAddr,
-        const float difl,
-        ssef *__restrict dist)
-{
-	const int offset = nodeAddr;
-	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
-	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-		return qbvh_unaligned_node_intersect_robust(kg,
-		                                            tnear,
-		                                            tfar,
-#ifdef __KERNEL_AVX2__
-		                                            P_idir,
-#endif
-		                                            P,
-		                                            dir,
-		                                            idir,
-		                                            near_x, near_y, near_z,
-		                                            far_x, far_y, far_z,
-		                                            nodeAddr,
-		                                            difl,
-		                                            dist);
-	}
-	else {
-		return qbvh_aligned_node_intersect_robust(kg,
-		                                          tnear,
-		                                          tfar,
-#ifdef __KERNEL_AVX2__
-		                                          P_idir,
-#else
-		                                          P,
-#endif
-		                                          idir,
-		                                          near_x, near_y, near_z,
-		                                          far_x, far_y, far_z,
-		                                          nodeAddr,
-		                                          difl,
-		                                          dist);
-	}
-}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
deleted file mode 100644
index e5e611a0d47..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const uint max_hits,
-                                             uint *num_hits)
-{
-	/* TODO(sergey):
-	 * - Likely and unlikely for if() statements.
-	 * - Test restrict attribute for pointers.
-	 */
-
-	/* Traversal stack in CUDA thread-local memory. */
-	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
-	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
-
-	/* Traversal variables in registers. */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* Ray parameters in registers. */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-	*num_hits = 0;
-	isect_array->t = tmax;
-
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
-#if BVH_FEATURE(BVH_INSTANCING)
-	int num_hits_in_instance = 0;
-#endif
-
-	ssef tnear(0.0f), tfar(tmax);
-#if BVH_FEATURE(BVH_HAIR)
-	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-	float3 P_idir = P*idir;
-	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-	/* Offsets to select the side that becomes the lower or upper bound. */
-	int near_x, near_y, near_z;
-	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* Traversal loop. */
-	do {
-		do {
-			/* Traverse internal nodes. */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-#ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-					continue;
-				}
-#endif
-
-				ssef dist;
-				int traverseChild = NODE_INTERSECT(kg,
-				                                   tnear,
-				                                   tfar,
-#ifdef __KERNEL_AVX2__
-				                                   P_idir4,
-#endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-				                                   org4,
-#  endif
-#  if BVH_FEATURE(BVH_HAIR)
-				                                   dir4,
-#  endif
-				                                   idir4,
-				                                   near_x, near_y, near_z,
-				                                   far_x, far_y, far_z,
-				                                   nodeAddr,
-				                                   &dist);
-
-				if(traverseChild != 0) {
-					float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
-					}
-					else
-#endif
-					{
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
-					}
-
-					/* One child is hit, continue with that child. */
-					int r = __bscf(traverseChild);
-					if(traverseChild == 0) {
-						nodeAddr = __float_as_int(cnodes[r]);
-						continue;
-					}
-
-					/* Two children are hit, push far child, and continue with
-					 * closer child.
-					 */
-					int c0 = __float_as_int(cnodes[r]);
-					float d0 = ((float*)&dist)[r];
-					r = __bscf(traverseChild);
-					int c1 = __float_as_int(cnodes[r]);
-					float d1 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						if(d1 < d0) {
-							nodeAddr = c1;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c0;
-							traversalStack[stackPtr].dist = d0;
-							continue;
-						}
-						else {
-							nodeAddr = c0;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c1;
-							traversalStack[stackPtr].dist = d1;
-							continue;
-						}
-					}
-
-					/* Here starts the slow path for 3 or 4 hit children. We push
-					 * all nodes onto the stack to sort them there.
-					 */
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = d1;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = d0;
-
-					/* Three children are hit, push all onto stack and sort 3
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c2 = __float_as_int(cnodes[r]);
-					float d2 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = c2;
-						traversalStack[stackPtr].dist = d2;
-						qbvh_stack_sort(&traversalStack[stackPtr],
-						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2]);
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-						continue;
-					}
-
-					/* Four children are hit, push all onto stack and sort 4
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c3 = __float_as_int(cnodes[r]);
-					float d3 = ((float*)&dist)[r];
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c3;
-					traversalStack[stackPtr].dist = d3;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c2;
-					traversalStack[stackPtr].dist = d2;
-					qbvh_stack_sort(&traversalStack[stackPtr],
-					                &traversalStack[stackPtr - 1],
-					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3]);
-				}
-
-				nodeAddr = traversalStack[stackPtr].addr;
-				--stackPtr;
-			}
-
-			/* If node is leaf, fetch triangle list. */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-#ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-					continue;
-				}
-#endif
-
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-					const uint p_type = type & PRIMITIVE_ALL;
-
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-
-					/* Primitive intersection. */
-					while(primAddr < primAddr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-
-						bool hit;
-
-						/* todo: specialized intersect functions which don't fill in
-						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
-						 * might give a few % performance improvement */
-
-						switch(p_type) {
-							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
-								break;
-							}
-#if BVH_FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
-								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
-								break;
-							}
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-							case PRIMITIVE_CURVE:
-							case PRIMITIVE_MOTION_CURVE: {
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
-									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
-								else
-									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
-								break;
-							}
-#endif
-							default: {
-								hit = false;
-								break;
-							}
-						}
-
-						/* Shadow ray early termination. */
-						if(hit) {
-							/* detect if this surface has a shader with transparent shadows */
-
-							/* todo: optimize so primitive visibility flag indicates if
-							 * the primitive has a transparent shadow shader? */
-							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-							int shader = 0;
-
-#ifdef __HAIR__
-							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-							{
-								shader = kernel_tex_fetch(__tri_shader, prim);
-							}
-#ifdef __HAIR__
-							else {
-								float4 str = kernel_tex_fetch(__curves, prim);
-								shader = __float_as_int(str.z);
-							}
-#endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
-
-							/* if no transparent shadows, all light is blocked */
-							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
-								return true;
-							}
-							/* if maximum number of hits reached, block all light */
-							else if(*num_hits == max_hits) {
-								return true;
-							}
-
-							/* move on to next entry in intersections array */
-							isect_array++;
-							(*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
-							num_hits_in_instance++;
-#endif
-
-							isect_array->t = isect_t;
-						}
-
-						primAddr++;
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* Instance push. */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
-#  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
-#  endif
-
-					num_hits_in_instance = 0;
-					isect_array->t = isect_t;
-
-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-					tfar = ssef(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-					P_idir = P*idir;
-					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-					triangle_intersect_precalc(dir, &isect_precalc);
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
-
-					nodeAddr = kernel_tex_fetch(__object_node, object);
-
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			if(num_hits_in_instance) {
-				float t_fac;
-
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-
-				/* scale isect->t to adjust for instancing */
-				for(int i = 0; i < num_hits_in_instance; i++)
-					(isect_array-i-1)->t *= t_fac;
-			}
-			else {
-				float ignore_t = FLT_MAX;
-
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
-#  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
-#  endif
-			}
-
-			isect_t = tmax;
-			isect_array->t = isect_t;
-
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-			tfar = ssef(tmax);
-#  if BVH_FEATURE(BVH_HAIR)
-			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-			P_idir = P*idir;
-			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-			triangle_intersect_precalc(dir, &isect_precalc);
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr].addr;
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return false;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
deleted file mode 100644
index 4adaf9c8f3d..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             SubsurfaceIntersection *ss_isect,
-                                             int subsurface_object,
-                                             uint *lcg_state,
-                                             int max_hits)
-{
-	/* TODO(sergey):
-	 * - Test if pushing distance on the stack helps (for non shadow rays).
-	 * - Separate version for shadow rays.
-	 * - Likely and unlikely for if() statements.
-	 * - SSE for hair.
-	 * - Test restrict attribute for pointers.
-	 */
-
-	/* Traversal stack in CUDA thread-local memory. */
-	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
-	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
-
-	/* Traversal variables in registers. */
-	int stackPtr = 0;
-	int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object);
-
-	/* Ray parameters in registers. */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = ray->t;
-
-	ss_isect->num_hits = 0;
-
-	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
-		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
-#else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
-#endif
-		object = subsurface_object;
-	}
-
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return;
-	}
-#endif
-
-	ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-	float3 P_idir = P*idir;
-	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-	/* Offsets to select the side that becomes the lower or upper bound. */
-	int near_x, near_y, near_z;
-	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* Traversal loop. */
-	do {
-		do {
-			/* Traverse internal nodes. */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				ssef dist;
-
-				int traverseChild = NODE_INTERSECT(kg,
-				                                   tnear,
-				                                   tfar,
-#ifdef __KERNEL_AVX2__
-				                                   P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-				                                   org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-				                                   dir4,
-#endif
-				                                   idir4,
-				                                   near_x, near_y, near_z,
-				                                   far_x, far_y, far_z,
-				                                   nodeAddr,
-				                                   &dist);
-
-				if(traverseChild != 0) {
-					float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-					float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
-					}
-					else
-#endif
-					{
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
-					}
-
-					/* One child is hit, continue with that child. */
-					int r = __bscf(traverseChild);
-					if(traverseChild == 0) {
-						nodeAddr = __float_as_int(cnodes[r]);
-						continue;
-					}
-
-					/* Two children are hit, push far child, and continue with
-					 * closer child.
-					 */
-					int c0 = __float_as_int(cnodes[r]);
-					float d0 = ((float*)&dist)[r];
-					r = __bscf(traverseChild);
-					int c1 = __float_as_int(cnodes[r]);
-					float d1 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						if(d1 < d0) {
-							nodeAddr = c1;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c0;
-							traversalStack[stackPtr].dist = d0;
-							continue;
-						}
-						else {
-							nodeAddr = c0;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c1;
-							traversalStack[stackPtr].dist = d1;
-							continue;
-						}
-					}
-
-					/* Here starts the slow path for 3 or 4 hit children. We push
-					 * all nodes onto the stack to sort them there.
-					 */
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = d1;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = d0;
-
-					/* Three children are hit, push all onto stack and sort 3
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c2 = __float_as_int(cnodes[r]);
-					float d2 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = c2;
-						traversalStack[stackPtr].dist = d2;
-						qbvh_stack_sort(&traversalStack[stackPtr],
-						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2]);
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-						continue;
-					}
-
-					/* Four children are hit, push all onto stack and sort 4
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c3 = __float_as_int(cnodes[r]);
-					float d3 = ((float*)&dist)[r];
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c3;
-					traversalStack[stackPtr].dist = d3;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c2;
-					traversalStack[stackPtr].dist = d2;
-					qbvh_stack_sort(&traversalStack[stackPtr],
-					                &traversalStack[stackPtr - 1],
-					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3]);
-				}
-
-				nodeAddr = traversalStack[stackPtr].addr;
-				--stackPtr;
-			}
-
-			/* If node is leaf, fetch triangle list. */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-				int primAddr2 = __float_as_int(leaf.y);
-				const uint type = __float_as_int(leaf.w);
-
-				/* Pop. */
-				nodeAddr = traversalStack[stackPtr].addr;
-				--stackPtr;
-
-				/* Primitive intersection. */
-				switch(type & PRIMITIVE_ALL) {
-					case PRIMITIVE_TRIANGLE: {
-						/* Intersect ray against primitive, */
-						for(; primAddr < primAddr2; primAddr++) {
-							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
-							                              ss_isect,
-							                              P,
-							                              object,
-							                              primAddr,
-							                              isect_t,
-							                              lcg_state,
-							                              max_hits);
-						}
-						break;
-					}
-#if BVH_FEATURE(BVH_MOTION)
-					case PRIMITIVE_MOTION_TRIANGLE: {
-						/* Intersect ray against primitive. */
-						for(; primAddr < primAddr2; primAddr++) {
-							kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-							motion_triangle_intersect_subsurface(kg,
-							                                     ss_isect,
-							                                     P,
-							                                     dir,
-							                                     ray->time,
-							                                     object,
-							                                     primAddr,
-							                                     isect_t,
-							                                     lcg_state,
-							                                     max_hits);
-						}
-						break;
-					}
-#endif
-					default:
-						break;
-				}
-			}
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
deleted file mode 100644
index 24bf85f46c8..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h
+++ /dev/null
@@ -1,465 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#  define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#  define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                             ,uint *lcg_state,
-                                             float difl,
-                                             float extmax
-#endif
-                                             )
-{
-	/* TODO(sergey):
-	 * - Test if pushing distance on the stack helps (for non shadow rays).
-	 * - Separate version for shadow rays.
-	 * - Likely and unlikely for if() statements.
-	 * - Test restrict attribute for pointers.
-	 */
-
-	/* Traversal stack in CUDA thread-local memory. */
-	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
-	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
-	traversalStack[0].dist = -FLT_MAX;
-
-	/* Traversal variables in registers. */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-	float nodeDist = -FLT_MAX;
-
-	/* Ray parameters in registers. */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
-	isect->t = ray->t;
-	isect->u = 0.0f;
-	isect->v = 0.0f;
-	isect->prim = PRIM_NONE;
-	isect->object = OBJECT_NONE;
-
-	BVH_DEBUG_INIT();
-
-	ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-	float3 P_idir = P*idir;
-	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-	sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-	/* Offsets to select the side that becomes the lower or upper bound. */
-	int near_x, near_y, near_z;
-	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* Traversal loop. */
-	do {
-		do {
-			/* Traverse internal nodes. */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-
-				if(UNLIKELY(nodeDist > isect->t)
-#ifdef __VISIBILITY_FLAG__
-				   || (__float_as_uint(inodes.x) & visibility) == 0)
-#endif
-				{
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					nodeDist = traversalStack[stackPtr].dist;
-					--stackPtr;
-					continue;
-				}
-
-				int traverseChild;
-				ssef dist;
-
-				BVH_DEBUG_NEXT_STEP();
-
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-				if(difl != 0.0f) {
-					/* NOTE: We extend all the child BB instead of fetching
-					 * and checking visibility flags for each of the,
-					 *
-					 * Need to test if doing opposite would be any faster.
-					 */
-					traverseChild = NODE_INTERSECT_ROBUST(kg,
-					                                      tnear,
-					                                      tfar,
-#  ifdef __KERNEL_AVX2__
-					                                      P_idir4,
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-					                                      org4,
-#  endif
-#  if BVH_FEATURE(BVH_HAIR)
-					                                      dir4,
-#  endif
-					                                      idir4,
-					                                      near_x, near_y, near_z,
-					                                      far_x, far_y, far_z,
-					                                      nodeAddr,
-					                                      difl,
-					                                      &dist);
-				}
-				else
-#endif  /* BVH_HAIR_MINIMUM_WIDTH */
-				{
-					traverseChild = NODE_INTERSECT(kg,
-					                               tnear,
-					                               tfar,
-#ifdef __KERNEL_AVX2__
-					                               P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-					                               org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-					                               dir4,
-#endif
-					                               idir4,
-					                               near_x, near_y, near_z,
-					                               far_x, far_y, far_z,
-					                               nodeAddr,
-					                               &dist);
-				}
-
-				if(traverseChild != 0) {
-					float4 cnodes;
-					/* TODO(sergey): Investigate whether moving cnodes upwards
-					 * gives a speedup (will be different cache pattern but will
-					 * avoid extra check here),
-					 */
-#if BVH_FEATURE(BVH_HAIR)
-					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
-					}
-					else
-#endif
-					{
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
-					}
-
-					/* One child is hit, continue with that child. */
-					int r = __bscf(traverseChild);
-					float d0 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						nodeAddr = __float_as_int(cnodes[r]);
-						nodeDist = d0;
-						continue;
-					}
-
-					/* Two children are hit, push far child, and continue with
-					 * closer child.
-					 */
-					int c0 = __float_as_int(cnodes[r]);
-					r = __bscf(traverseChild);
-					int c1 = __float_as_int(cnodes[r]);
-					float d1 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						if(d1 < d0) {
-							nodeAddr = c1;
-							nodeDist = d1;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c0;
-							traversalStack[stackPtr].dist = d0;
-							continue;
-						}
-						else {
-							nodeAddr = c0;
-							nodeDist = d0;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c1;
-							traversalStack[stackPtr].dist = d1;
-							continue;
-						}
-					}
-
-					/* Here starts the slow path for 3 or 4 hit children. We push
-					 * all nodes onto the stack to sort them there.
-					 */
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = d1;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = d0;
-
-					/* Three children are hit, push all onto stack and sort 3
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c2 = __float_as_int(cnodes[r]);
-					float d2 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = c2;
-						traversalStack[stackPtr].dist = d2;
-						qbvh_stack_sort(&traversalStack[stackPtr],
-						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2]);
-						nodeAddr = traversalStack[stackPtr].addr;
-						nodeDist = traversalStack[stackPtr].dist;
-						--stackPtr;
-						continue;
-					}
-
-					/* Four children are hit, push all onto stack and sort 4
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c3 = __float_as_int(cnodes[r]);
-					float d3 = ((float*)&dist)[r];
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c3;
-					traversalStack[stackPtr].dist = d3;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c2;
-					traversalStack[stackPtr].dist = d2;
-					qbvh_stack_sort(&traversalStack[stackPtr],
-					                &traversalStack[stackPtr - 1],
-					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3]);
-				}
-
-				nodeAddr = traversalStack[stackPtr].addr;
-				nodeDist = traversalStack[stackPtr].dist;
-				--stackPtr;
-			}
-
-			/* If node is leaf, fetch triangle list. */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-
-#ifdef __VISIBILITY_FLAG__
-				if(UNLIKELY((nodeDist > isect->t) ||
-				            ((__float_as_uint(leaf.z) & visibility) == 0)))
-#else
-				if(UNLIKELY((nodeDist > isect->t)))
-#endif
-				{
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					nodeDist = traversalStack[stackPtr].dist;
-					--stackPtr;
-					continue;
-				}
-
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					nodeDist = traversalStack[stackPtr].dist;
-					--stackPtr;
-
-					/* Primitive intersection. */
-					switch(type & PRIMITIVE_ALL) {
-						case PRIMITIVE_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
-									tfar = ssef(isect->t);
-									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-								}
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
-									tfar = ssef(isect->t);
-									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-								}
-							}
-							break;
-						}
-#endif  /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
-						case PRIMITIVE_CURVE:
-						case PRIMITIVE_MOTION_CURVE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								bool hit;
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
-									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								else
-									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								if(hit) {
-									tfar = ssef(isect->t);
-									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
-										return true;
-								}
-							}
-							break;
-						}
-#endif  /* BVH_FEATURE(BVH_HAIR) */
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* Instance push. */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-
-#  if BVH_FEATURE(BVH_MOTION)
-					qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_itfm);
-#  else
-					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist);
-#  endif
-
-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-					tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-					P_idir = P*idir;
-					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-					triangle_intersect_precalc(dir, &isect_precalc);
-
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
-					traversalStack[stackPtr].dist = -FLT_MAX;
-
-					nodeAddr = kernel_tex_fetch(__object_node, object);
-
-					BVH_DEBUG_NEXT_INSTANCE();
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			/* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-			tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-			P_idir = P*idir;
-			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-			triangle_intersect_precalc(dir, &isect_precalc);
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr].addr;
-			nodeDist = traversalStack[stackPtr].dist;
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
-#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
deleted file mode 100644
index da21ede9e12..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh_volume.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect,
-                                             const uint visibility)
-{
-	/* TODO(sergey):
-	 * - Test if pushing distance on the stack helps.
-	 * - Likely and unlikely for if() statements.
-	 * - Test restrict attribute for pointers.
-	 */
-
-	/* Traversal stack in CUDA thread-local memory. */
-	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
-	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
-
-	/* Traversal variables in registers. */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* Ray parameters in registers. */
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
-	isect->t = ray->t;
-	isect->u = 0.0f;
-	isect->v = 0.0f;
-	isect->prim = PRIM_NONE;
-	isect->object = OBJECT_NONE;
-
-	ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
-	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-	float3 P_idir = P*idir;
-	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-	/* Offsets to select the side that becomes the lower or upper bound. */
-	int near_x, near_y, near_z;
-	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* Traversal loop. */
-	do {
-		do {
-			/* Traverse internal nodes. */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-#ifdef __VISIBILITY_FLAG__
-				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				if((__float_as_uint(inodes.x) & visibility) == 0) {
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-					continue;
-				}
-#endif
-
-				ssef dist;
-				int traverseChild = NODE_INTERSECT(kg,
-				                                   tnear,
-				                                   tfar,
-#ifdef __KERNEL_AVX2__
-				                                   P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-				                                   org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-				                                   dir4,
-#endif
-				                                   idir4,
-				                                   near_x, near_y, near_z,
-				                                   far_x, far_y, far_z,
-				                                   nodeAddr,
-				                                   &dist);
-
-				if(traverseChild != 0) {
-					float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
-					}
-					else
-#endif
-					{
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
-					}
-
-					/* One child is hit, continue with that child. */
-					int r = __bscf(traverseChild);
-					if(traverseChild == 0) {
-						nodeAddr = __float_as_int(cnodes[r]);
-						continue;
-					}
-
-					/* Two children are hit, push far child, and continue with
-					 * closer child.
-					 */
-					int c0 = __float_as_int(cnodes[r]);
-					float d0 = ((float*)&dist)[r];
-					r = __bscf(traverseChild);
-					int c1 = __float_as_int(cnodes[r]);
-					float d1 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						if(d1 < d0) {
-							nodeAddr = c1;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c0;
-							traversalStack[stackPtr].dist = d0;
-							continue;
-						}
-						else {
-							nodeAddr = c0;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c1;
-							traversalStack[stackPtr].dist = d1;
-							continue;
-						}
-					}
-
-					/* Here starts the slow path for 3 or 4 hit children. We push
-					 * all nodes onto the stack to sort them there.
-					 */
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = d1;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = d0;
-
-					/* Three children are hit, push all onto stack and sort 3
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c2 = __float_as_int(cnodes[r]);
-					float d2 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = c2;
-						traversalStack[stackPtr].dist = d2;
-						qbvh_stack_sort(&traversalStack[stackPtr],
-						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2]);
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-						continue;
-					}
-
-					/* Four children are hit, push all onto stack and sort 4
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c3 = __float_as_int(cnodes[r]);
-					float d3 = ((float*)&dist)[r];
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c3;
-					traversalStack[stackPtr].dist = d3;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c2;
-					traversalStack[stackPtr].dist = d2;
-					qbvh_stack_sort(&traversalStack[stackPtr],
-					                &traversalStack[stackPtr - 1],
-					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3]);
-				}
-
-				nodeAddr = traversalStack[stackPtr].addr;
-				--stackPtr;
-			}
-
-			/* If node is leaf, fetch triangle list. */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-					const uint p_type = type & PRIMITIVE_ALL;
-
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-
-					/* Primitive intersection. */
-					switch(p_type) {
-						case PRIMITIVE_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* Only primitives from volume object. */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* Only primitives from volume object. */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								/* Intersect ray against primitive. */
-								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
-							}
-							break;
-						}
-#endif
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* Instance push. */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-					int object_flag = kernel_tex_fetch(__object_flag, object);
-
-					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
-#  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-
-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-						tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-						P_idir = P*idir;
-						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-						triangle_intersect_precalc(dir, &isect_precalc);
-
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
-
-						nodeAddr = kernel_tex_fetch(__object_node, object);
-					}
-					else {
-						/* Pop. */
-						object = OBJECT_NONE;
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-					}
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			/* Instance pop. */
-#  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
-#  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
-#  endif
-
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-			tfar = ssef(isect->t);
-#  if BVH_FEATURE(BVH_HAIR)
-			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-			P_idir = P*idir;
-			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-			triangle_intersect_precalc(dir, &isect_precalc);
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr].addr;
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
deleted file mode 100644
index 8a31775fae3..00000000000
--- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-#  define NODE_INTERSECT qbvh_node_intersect
-#else
-#  define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
-                                             const Ray *ray,
-                                             Intersection *isect_array,
-                                             const uint max_hits,
-                                             const uint visibility)
-{
-	/* TODO(sergey):
-	 * - Test if pushing distance on the stack helps.
-	 * - Likely and unlikely for if() statements.
-	 * - Test restrict attribute for pointers.
-	 */
-
-	/* Traversal stack in CUDA thread-local memory. */
-	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
-	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
-
-	/* Traversal variables in registers. */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* Ray parameters in registers. */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 dir = bvh_clamp_direction(ray->D);
-	float3 idir = bvh_inverse_direction(dir);
-	int object = OBJECT_NONE;
-	float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-	Transform ob_itfm;
-#endif
-
-	uint num_hits = 0;
-	isect_array->t = tmax;
-
-#ifndef __KERNEL_SSE41__
-	if(!isfinite(P.x)) {
-		return false;
-	}
-#endif
-
-#if BVH_FEATURE(BVH_INSTANCING)
-	int num_hits_in_instance = 0;
-#endif
-
-	ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
-	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
-	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
-	float3 P_idir = P*idir;
-	sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-	sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
-	/* Offsets to select the side that becomes the lower or upper bound. */
-	int near_x, near_y, near_z;
-	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
-	/* Traversal loop. */
-	do {
-		do {
-			/* Traverse internal nodes. */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-#ifdef __VISIBILITY_FLAG__
-				float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
-				if((__float_as_uint(inodes.x) & visibility) == 0) {
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-					continue;
-				}
-#endif
-
-				ssef dist;
-				int traverseChild = NODE_INTERSECT(kg,
-				                                   tnear,
-				                                   tfar,
-#ifdef __KERNEL_AVX2__
-				                                   P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-				                                   org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
-				                                   dir4,
-#endif
-				                                   idir4,
-				                                   near_x, near_y, near_z,
-				                                   far_x, far_y, far_z,
-				                                   nodeAddr,
-				                                   &dist);
-
-				if(traverseChild != 0) {
-					float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
-					if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+13);
-					}
-					else
-#endif
-					{
-						cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+7);
-					}
-
-					/* One child is hit, continue with that child. */
-					int r = __bscf(traverseChild);
-					if(traverseChild == 0) {
-						nodeAddr = __float_as_int(cnodes[r]);
-						continue;
-					}
-
-					/* Two children are hit, push far child, and continue with
-					 * closer child.
-					 */
-					int c0 = __float_as_int(cnodes[r]);
-					float d0 = ((float*)&dist)[r];
-					r = __bscf(traverseChild);
-					int c1 = __float_as_int(cnodes[r]);
-					float d1 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						if(d1 < d0) {
-							nodeAddr = c1;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c0;
-							traversalStack[stackPtr].dist = d0;
-							continue;
-						}
-						else {
-							nodeAddr = c0;
-							++stackPtr;
-							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-							traversalStack[stackPtr].addr = c1;
-							traversalStack[stackPtr].dist = d1;
-							continue;
-						}
-					}
-
-					/* Here starts the slow path for 3 or 4 hit children. We push
-					 * all nodes onto the stack to sort them there.
-					 */
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c1;
-					traversalStack[stackPtr].dist = d1;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c0;
-					traversalStack[stackPtr].dist = d0;
-
-					/* Three children are hit, push all onto stack and sort 3
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c2 = __float_as_int(cnodes[r]);
-					float d2 = ((float*)&dist)[r];
-					if(traverseChild == 0) {
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = c2;
-						traversalStack[stackPtr].dist = d2;
-						qbvh_stack_sort(&traversalStack[stackPtr],
-						                &traversalStack[stackPtr - 1],
-						                &traversalStack[stackPtr - 2]);
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-						continue;
-					}
-
-					/* Four children are hit, push all onto stack and sort 4
-					 * stack items, continue with closest child.
-					 */
-					r = __bscf(traverseChild);
-					int c3 = __float_as_int(cnodes[r]);
-					float d3 = ((float*)&dist)[r];
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c3;
-					traversalStack[stackPtr].dist = d3;
-					++stackPtr;
-					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-					traversalStack[stackPtr].addr = c2;
-					traversalStack[stackPtr].dist = d2;
-					qbvh_stack_sort(&traversalStack[stackPtr],
-					                &traversalStack[stackPtr - 1],
-					                &traversalStack[stackPtr - 2],
-					                &traversalStack[stackPtr - 3]);
-				}
-
-				nodeAddr = traversalStack[stackPtr].addr;
-				--stackPtr;
-			}
-
-			/* If node is leaf, fetch triangle list. */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-					const uint type = __float_as_int(leaf.w);
-					const uint p_type = type & PRIMITIVE_ALL;
-					bool hit;
-
-					/* Pop. */
-					nodeAddr = traversalStack[stackPtr].addr;
-					--stackPtr;
-
-					/* Primitive intersection. */
-					switch(p_type) {
-						case PRIMITIVE_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* Only primitives from volume object. */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								/* Intersect ray against primitive. */
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
-								if(hit) {
-									/* Move on to next entry in intersections array. */
-									isect_array++;
-									num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
-									num_hits_in_instance++;
-#endif
-									isect_array->t = isect_t;
-									if(num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
-#  if BVH_FEATURE(BVH_MOTION)
-										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-										Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-										float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-										for(int i = 0; i < num_hits_in_instance; i++) {
-											(isect_array-i-1)->t *= t_fac;
-										}
-#endif  /* BVH_FEATURE(BVH_INSTANCING) */
-										return num_hits;
-									}
-								}
-							}
-							break;
-						}
-#if BVH_FEATURE(BVH_MOTION)
-						case PRIMITIVE_MOTION_TRIANGLE: {
-							for(; primAddr < primAddr2; primAddr++) {
-								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
-								/* Only primitives from volume object. */
-								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
-								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-									continue;
-								}
-								/* Intersect ray against primitive. */
-								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
-								if(hit) {
-									/* Move on to next entry in intersections array. */
-									isect_array++;
-									num_hits++;
-#  if BVH_FEATURE(BVH_INSTANCING)
-									num_hits_in_instance++;
-#  endif
-									isect_array->t = isect_t;
-									if(num_hits == max_hits) {
-#  if BVH_FEATURE(BVH_INSTANCING)
-#    if BVH_FEATURE(BVH_MOTION)
-										float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#    else
-										Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-										float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#    endif
-										for(int i = 0; i < num_hits_in_instance; i++) {
-											(isect_array-i-1)->t *= t_fac;
-										}
-#  endif  /* BVH_FEATURE(BVH_INSTANCING) */
-										return num_hits;
-									}
-								}
-							}
-							break;
-						}
-#endif
-					}
-				}
-#if BVH_FEATURE(BVH_INSTANCING)
-				else {
-					/* Instance push. */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-					int object_flag = kernel_tex_fetch(__object_flag, object);
-
-					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
-#  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
-#  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
-#  endif
-
-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-						tfar = ssef(isect_t);
-						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  if BVH_FEATURE(BVH_HAIR)
-						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-#  ifdef __KERNEL_AVX2__
-						P_idir = P*idir;
-						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-						triangle_intersect_precalc(dir, &isect_precalc);
-						num_hits_in_instance = 0;
-						isect_array->t = isect_t;
-
-						++stackPtr;
-						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
-						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
-
-						nodeAddr = kernel_tex_fetch(__object_node, object);
-					}
-					else {
-						/* Pop. */
-						object = OBJECT_NONE;
-						nodeAddr = traversalStack[stackPtr].addr;
-						--stackPtr;
-					}
-				}
-			}
-#endif  /* FEATURE(BVH_INSTANCING) */
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != OBJECT_NONE);
-
-			/* Instance pop. */
-			if(num_hits_in_instance) {
-				float t_fac;
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#  else
-				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
-				/* Scale isect->t to adjust for instancing. */
-				for(int i = 0; i < num_hits_in_instance; i++) {
-					(isect_array-i-1)->t *= t_fac;
-				}
-			}
-			else {
-				float ignore_t = FLT_MAX;
-#  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
-#  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
-#  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
-			}
-
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-			tfar = ssef(isect_t);
-#  if BVH_FEATURE(BVH_HAIR)
-			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#  endif
-			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-#  ifdef __KERNEL_AVX2__
-			P_idir = P*idir;
-			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#  endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
-			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#  endif
-
-			triangle_intersect_precalc(dir, &isect_precalc);
-			isect_t = tmax;
-			isect_array->t = isect_t;
-
-			object = OBJECT_NONE;
-			nodeAddr = traversalStack[stackPtr].addr;
-			--stackPtr;
-		}
-#endif  /* FEATURE(BVH_INSTANCING) */
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return num_hits;
-}
-
-#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 3c3503eab8b..d5b31037723 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -25,6 +25,7 @@
 #include "kernel_camera.h"
 
 #include "geom/geom.h"
+#include "bvh/bvh.h"
 
 #include "kernel_accumulate.h"
 #include "kernel_shader.h"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index aad06ed5c76..37907cd8fdc 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -35,6 +35,7 @@
 #  include "../../kernel_montecarlo.h"
 #  include "../../kernel_projection.h"
 #  include "../../geom/geom.h"
+#  include "../../bvh/bvh.h"
 
 #  include "../../kernel_accumulate.h"
 #  include "../../kernel_camera.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index acc6887cb17..2bb2be5e6b3 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -47,6 +47,7 @@
 #include "kernel_camera.h"
 #include "kernels/cpu/kernel_cpu_image.h"
 #include "geom/geom.h"
+#include "bvh/bvh.h"
 
 #include "kernel_projection.h"
 #include "kernel_accumulate.h"
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index e1c7e2cea99..88d6dab04d0 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -31,6 +31,7 @@
 #include "kernel_camera.h"
 
 #include "geom/geom.h"
+#include "bvh/bvh.h"
 
 #include "kernel_accumulate.h"
 #include "kernel_shader.h"
-- 
cgit v1.2.3