From 84470a1190b28cd37491e5002aea4695e4f26f44 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Sat, 29 Mar 2014 13:03:45 +0100
Subject: Cycles code refactor: move geometry related kernel files into own
 directory.

---
 intern/cycles/kernel/CMakeLists.txt             |   20 +-
 intern/cycles/kernel/SConscript                 |    5 +-
 intern/cycles/kernel/geom/geom_bvh.h            | 1322 +++++++++++++++++++++++
 intern/cycles/kernel/geom/geom_bvh_subsurface.h |  294 +++++
 intern/cycles/kernel/geom/geom_bvh_traversal.h  |  354 ++++++
 intern/cycles/kernel/geom/geom_curve.h          |  137 +++
 intern/cycles/kernel/geom/geom_object.h         |  300 +++++
 intern/cycles/kernel/geom/geom_triangle.h       |  180 +++
 intern/cycles/kernel/kernel_bvh.h               | 1318 ----------------------
 intern/cycles/kernel/kernel_bvh_subsurface.h    |  294 -----
 intern/cycles/kernel/kernel_bvh_traversal.h     |  354 ------
 intern/cycles/kernel/kernel_curve.h             |  137 ---
 intern/cycles/kernel/kernel_object.h            |  300 -----
 intern/cycles/kernel/kernel_path.h              |    9 +-
 intern/cycles/kernel/kernel_triangle.h          |  180 ---
 intern/cycles/kernel/osl/osl_services.cpp       |    9 +-
 intern/cycles/kernel/osl/osl_shader.cpp         |    3 +-
 17 files changed, 2612 insertions(+), 2604 deletions(-)
 create mode 100644 intern/cycles/kernel/geom/geom_bvh.h
 create mode 100644 intern/cycles/kernel/geom/geom_bvh_subsurface.h
 create mode 100644 intern/cycles/kernel/geom/geom_bvh_traversal.h
 create mode 100644 intern/cycles/kernel/geom/geom_curve.h
 create mode 100644 intern/cycles/kernel/geom/geom_object.h
 create mode 100644 intern/cycles/kernel/geom/geom_triangle.h
 delete mode 100644 intern/cycles/kernel/kernel_bvh.h
 delete mode 100644 intern/cycles/kernel/kernel_bvh_subsurface.h
 delete mode 100644 intern/cycles/kernel/kernel_bvh_traversal.h
 delete mode 100644 intern/cycles/kernel/kernel_curve.h
 delete mode 100644 intern/cycles/kernel/kernel_object.h
 delete mode 100644 intern/cycles/kernel/kernel_triangle.h

(limited to 'intern/cycles/kernel')

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index ebeebe20c0f..ccefb314894 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -19,14 +19,10 @@ set(SRC
 set(SRC_HEADERS
 	kernel.h
 	kernel_accumulate.h
-	kernel_bvh.h
-	kernel_bvh_subsurface.h
-	kernel_bvh_traversal.h
 	kernel_camera.h
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
-	kernel_curve.h
 	kernel_differential.h
 	kernel_displace.h
 	kernel_emission.h
@@ -36,7 +32,6 @@ set(SRC_HEADERS
 	kernel_light.h
 	kernel_math.h
 	kernel_montecarlo.h
-	kernel_object.h
 	kernel_passes.h
 	kernel_path.h
 	kernel_path_state.h
@@ -47,7 +42,6 @@ set(SRC_HEADERS
 	kernel_shadow.h
 	kernel_subsurface.h
 	kernel_textures.h
-	kernel_triangle.h
 	kernel_types.h
 	kernel_volume.h
 )
@@ -114,6 +108,15 @@ set(SRC_SVM_HEADERS
 	svm/svm_wave.h
 )
 
+set(SRC_GEOM_HEADERS  # geometry kernel headers that live in the geom/ subdirectory
+	geom/geom_bvh.h
+	geom/geom_bvh_subsurface.h
+	geom/geom_bvh_traversal.h
+	geom/geom_curve.h
+	geom/geom_object.h
+	geom/geom_triangle.h
+)
+
 set(SRC_UTIL_HEADERS
 	../util/util_color.h
 	../util/util_half.h
@@ -146,7 +149,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
@@ -222,7 +225,7 @@ if(CXX_HAS_SSE)
 endif()
 
 
-add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS})
+add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
 
 if(WITH_CYCLES_CUDA)
 	add_dependencies(cycles_kernel cycles_kernel_cuda)
@@ -243,5 +246,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 5077d8c96b0..b2eafe6a83d 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -60,6 +60,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     kernel_file = os.path.join(source_dir, "kernel.cu")
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
+    geom_dir = os.path.join(source_dir, "../geom")
     closure_dir = os.path.join(source_dir, "../closure")
 
     # get CUDA version
@@ -76,10 +77,10 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     nvcc_flags += " --cubin --ptxas-options=\"-v\""
     nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
-    nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir)
+    nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)  # one -I per include directory in the tuple
 
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('closure/*.h')
+    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     # add command for each cuda architecture
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
new file mode 100644
index 00000000000..0272dff5115
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -0,0 +1,1322 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * "Persistent while-while kernel" used in:
+ *
+ * "Understanding the Efficiency of Ray Traversal on GPUs",
+ * Timo Aila and Samuli Laine,
+ * Proc. High-Performance Graphics 2009
+ */
+
+/* bottom-most stack entry, indicating the end of traversal */
+#define ENTRYPOINT_SENTINEL 0x76543210
+
+/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
+#define BVH_STACK_SIZE 192
+#define BVH_NODE_SIZE 4
+#define TRI_NODE_SIZE 3
+
+/* silly workaround for float extended precision that happens when compiling
+ * without sse support on x86, it results in different results for float ops
+ * that you would otherwise expect to compare correctly */
+#if !defined(__i386__) || defined(__SSE__)
+#define NO_EXTENDED_PRECISION
+#else
+#define NO_EXTENDED_PRECISION volatile
+#endif
+
+#include "geom_object.h"
+#include "geom_curve.h"
+#include "geom_triangle.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline float3 bvh_inverse_direction(float3 dir)
+{
+	/* returns the per-component reciprocal of dir; to avoid divide by zero, a component with |d| <= ooeps is replaced by copysignf(ooeps, d) first (ooeps = exp2f(-80.0f)) */
+	float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
+	float3 idir;
+
+	idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
+	idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
+	idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
+
+	return idir;
+}
+
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+	/* rewrite *P, *idir and *t for the instance: origin and direction go through the object's OBJECT_INVERSE_TRANSFORM */
+	*P = transform_point(&tfm, ray->P);
+
+	float3 dir = transform_direction(&tfm, ray->D);
+	/* presumably normalize_len() returns the normalized vector and stores the original length in len - confirm against its definition */
+	float len;
+	dir = normalize_len(dir, &len);
+
+	*idir = bvh_inverse_direction(dir);
+	/* scale the distance by the same length; *t == FLT_MAX is left untouched. tmax is not used in this function */
+	if(*t != FLT_MAX)
+		*t *= len;
+}
+
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
+{
+	if(*t != FLT_MAX) { /* rescale t by the length of the current direction (1/idir) transformed with OBJECT_TRANSFORM; FLT_MAX is left untouched */
+		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+		*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
+	}
+	/* restore origin and inverse direction from the original ray; tmax is not used in this function */
+	*P = ray->P;
+	*idir = bvh_inverse_direction(ray->D);
+}
+
+#ifdef __OBJECT_MOTION__
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
+{
+	Transform itfm;
+	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
+	/* itfm (filled in by the fetch above for ray->time) transforms the ray; *tfm is handed back to the caller, bvh_instance_motion_pop() takes it */
+	*P = transform_point(&itfm, ray->P);
+
+	float3 dir = transform_direction(&itfm, ray->D);
+	/* presumably normalize_len() returns the normalized vector and stores the original length in len - confirm against its definition */
+	float len;
+	dir = normalize_len(dir, &len);
+
+	*idir = bvh_inverse_direction(dir);
+	/* scale the distance by the same length; *t == FLT_MAX is left untouched. tmax is not used in this function */
+	if(*t != FLT_MAX)
+		*t *= len;
+}
+
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
+{
+	if(*t != FLT_MAX)
+		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
+	/* t was rescaled with the caller-supplied *tfm above; now restore origin and inverse direction from the original ray. kg, object and tmax are not used here */
+	*P = ray->P;
+	*idir = bvh_inverse_direction(ray->D);
+}
+#endif
+
+/* Sven Woop's algorithm: ray/triangle test on the three float4 rows stored per triangle in __tri_woop; fills *isect and returns true only for a hit with 0 < t < isect->t */
+ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 idir, uint visibility, int object, int triAddr)
+{
+	/* compute and check intersection t-value */
+	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
+	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
+	float3 dir = 1.0f/idir;
+
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
+	float t = Oz * invDz;
+
+	if(t > 0.0f && t < isect->t) {
+		/* compute and check barycentric u */
+		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
+		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
+		float u = Ox + t*Dx;
+
+		if(u >= 0.0f) {
+			/* compute and check barycentric v; the third row is only fetched once t and u have passed */
+			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
+			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
+			float v = Oy + t*Dy;
+
+			if(v >= 0.0f && u + v <= 1.0f) {
+#ifdef __VISIBILITY_FLAG__
+				/* visibility flag test. we do it here under the assumption
+				 * that most triangles are culled by node flags */
+				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+				{
+					/* record intersection */
+					isect->prim = triAddr;
+					isect->object = object;
+					isect->u = u;
+					isect->v = v;
+					isect->t = t;
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
+
+#ifdef __HAIR__
+ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
+{
+	float halfdiscroot = (p2 * p2 - 3 * p3 * p1); /* a quarter of the discriminant of the derivative 3*p3*t^2 + 2*p2*t + p1 */
+	float ta = -1.0f;
+	float tb = -1.0f;
+	*extremta = -1.0f;
+	*extremtb = -1.0f;
+	*upper = p0; /* bounds of p3*t^3 + p2*t^2 + p1*t + p0 on [0, 1] start from its endpoint values: this is t = 0 */
+	*lower = p0 + p1 + p2 + p3; /* value at t = 1 */
+	*extrema = *upper;
+	*extremb = *lower;
+	if(*lower >= *upper) {
+		*upper = *lower;
+		*lower = p0;
+	}
+	/* stationary points of the cubic; single precision sqrtf, like the other square roots in this file */
+	if(halfdiscroot >= 0) {
+		halfdiscroot = sqrtf(halfdiscroot);
+		ta = (-p2 - halfdiscroot) / (3 * p3);
+		tb = (-p2 + halfdiscroot) / (3 * p3);
+	}
+	/* only a stationary point strictly inside (0, 1) can widen the bounds; otherwise extremta/extremtb stay at -1 */
+	float t2;
+	float t3;
+	if(ta > 0.0f && ta < 1.0f) {
+		t2 = ta * ta;
+		t3 = t2 * ta;
+		*extremta = ta;
+		*extrema = p3 * t3 + p2 * t2 + p1 * ta + p0;
+		if(*extrema > *upper) {
+			*upper = *extrema;
+		}
+		if(*extrema < *lower) {
+			*lower = *extrema;
+		}
+	}
+	if(tb > 0.0f && tb < 1.0f) {
+		t2 = tb * tb;
+		t3 = t2 * tb;
+		*extremtb = tb;
+		*extremb = p3 * t3 + p2 * t2 + p1 * tb + p0;
+		if(*extremb >= *upper) {
+			*upper = *extremb;
+		}
+		if(*extremb <= *lower) {
+			*lower = *extremb;
+		}
+	}
+}
+
+#ifdef __KERNEL_SSE2__
+ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
+{
+	return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); /* presumably a.x*t[0] + a.y*t[1] + a.z*t[2], assuming fma(x, y, z) = x*y + z and broadcast<i> splats lane i - confirm */
+}
+#endif
+
+#ifdef __KERNEL_SSE2__
+/* Intersect a ray with one cubic curve segment by iterative subdivision. With SSE2, pass P and idir by reference to aligned vector */
+ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
+#else
+ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
+#endif
+{
+	float epsilon = 0.0f;
+	float r_st, r_en;
+
+	int depth = kernel_data.curve.subdivisions;
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	__m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
+	__m128 vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;
+	
+	{
+		__m128 dtmp = _mm_mul_ps(vdir, vdir);
+		__m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
+		__m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
+
+		__m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;
+
+		int k0 = v00.x + segment;
+		int k1 = k0 + 1;
+		int ka = max(k0 - 1, v00.x);
+		int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+		__m128 P0 = _mm_load_ps(&kg->__curve_keys.data[ka].x);
+		__m128 P1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
+		__m128 P2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+		__m128 P3 = _mm_load_ps(&kg->__curve_keys.data[kb].x);
+
+		__m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
+		__m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
+		__m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
+		__m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		__m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
+
+		__m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		__m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
+		__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+		__m128 htfm[] = { htfm0, htfm1, htfm2 };
+		__m128 vP = load_m128(P);
+		__m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, vP));
+		__m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, vP));
+		__m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, vP));
+		__m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, vP));
+
+		float fc = 0.71f;
+		__m128 vfc = _mm_set1_ps(fc);
+		__m128 vfcxp3 = _mm_mul_ps(vfc, p3);
+
+		vcurve_coef[0] = p1;
+		vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
+		vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
+
+		r_st = ((float4 &)P1).w;
+		r_en = ((float4 &)P2).w;
+	}
+#else
+	float3 curve_coef[4];
+
+	/* curve intersection check: recover the ray direction from its inverse */
+	float3 dir = 1.0f/idir;
+
+	/* obtain curve parameters */
+	{
+		/* ray transform created (its third argument row is the ray direction) - this should be created at beginning of intersection loop */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0,
+			0, 0, 0, 1);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);
+
+		int k0 = __float_as_int(v00.x) + segment;
+		int k1 = k0 + 1;
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
+		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
+		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
+		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P0) - P);
+		float3 p1 = transform_point(&htfm, float4_to_float3(P1) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P2) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P3) - P);
+
+		float fc = 0.71f;
+		curve_coef[0] = p1;
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P1.w;
+		r_en = P2.w;
+	}
+#endif
+
+	float r_curr = max(r_st, r_en);
+
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;
+
+	/* find bounds - this is slow for cubic curves */
+	float upper, lower;
+
+	float zextrem[4];
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+		return false;
+
+	/* minimum width extension */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	/* setup recurrent loop: the segment is split into 2^depth steps of size resol; tree is the start and level the length of the current interval, both counted in steps */
+	int level = 1 << depth;
+	int tree = 0;
+	float resol = 1.0f / (float)level;
+	bool hit = false;
+
+	/* begin loop: runs until tree reaches 2^depth */
+	while(!(tree >> (depth))) {
+		float i_st = tree * resol;
+		float i_en = i_st + (level * resol);
+#ifdef __KERNEL_SSE2__
+		__m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
+		__m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		__m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		__m128 vbmin = _mm_min_ps(vp_st, vp_en);
+		__m128 vbmax = _mm_max_ps(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+		
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif
+
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+
+		float r1 = r_st + (r_en - r_st) * i_st;
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;
+		float coverage = 1.0f;
+
+		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* the bounding box does not overlap the square centered at O: skip this interval and continue after it */
+			tree += level;
+			level = tree & -tree;
+		}
+		else if (level == 1) {
+
+			/* the maximum recursion depth is reached.
+			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			* dP* is reversed if necessary.*/
+			float t = isect->t;
+			float u = 0.0f;
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);
+				float w = tg.x * tg.x + tg.y * tg.y;
+				if (w == 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+				w = clamp((float)w, 0.0f, 1.0f);
+
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* compare x-y distances */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if (dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if (dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* compute coverage */
+				float r_ext = r_curr;
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					if (d0 >= 0)
+						coverage = (min(d1 / mw_extension, 1.0f) - min(d0 / mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 / mw_extension, 1.0f) + min(-d0 / mw_extension, 1.0f)) * 0.5f;
+				}
+				
+				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+			}
+			else {
+				float l = len(p_en - p_st);
+				/* minimum width extension */
+				float or1 = r1;
+				float or2 = r2;
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float3 tg = (p_en - p_st) / l;
+				float gd = (or2 - or1) / l;
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb/cyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;
+				if (td < 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				
+				float rootd = sqrtf(td);
+				float correction = ((-tb - rootd)/(2*cyla));
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if (dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if (dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+					correction = ((-tb + rootd)/(2*cyla));
+					t = tcentre + correction;
+				}			
+
+				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction))/l;
+				w = clamp((float)w, 0.0f, 1.0f);
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r1 + (r2 - r1) * w;
+				r_ext = or1 + (or2 - or1) * w;
+				coverage = r_curr/r_ext;
+
+			}
+			/* we found a new intersection */
+
+			/* stochastic fade from minimum width; a rejected sample ends the whole test and returns the hit state so far */
+			if(lcg_state && coverage != 1.0f) {
+				if(lcg_step_float(lcg_state) > coverage)
+					return hit;
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most primitives are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->prim = curveAddr;
+				isect->segment = segment;
+				isect->object = object;
+				isect->u = u;
+				isect->v = 0.0f;
+				/*isect->v = 1.0f - coverage; */
+				isect->t = t;
+				hit = true;
+			}
+			
+			tree++;
+			level = tree & -tree;
+		}
+		else {
+			/* split the curve into two curves and process */
+			level = level >> 1;
+		}
+	}
+
+	return hit;
+}
+
+ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
+{
+	/* ray test against the line segment between curve keys k0 and k1, with radii read from the keys' w components. define few macros to minimize code duplication for SSE */
+#ifndef __KERNEL_SSE2__
+#define len3_squared(x) len_squared(x)
+#define len3(x) len(x)
+#define dot3(x, y) dot(x, y)
+#endif
+
+	/* curve Intersection check */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
+	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
+
+	float or1 = P1.w;
+	float or2 = P2.w;
+	float3 p1 = float4_to_float3(P1);
+	float3 p2 = float4_to_float3(P2);
+
+	/* minimum width extension */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 dir = 1.0f / idir;
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else
+	const __m128 p1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
+	const __m128 p2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+	const __m128 or12 = shuffle<3, 3, 3, 3>(p1, p2);
+
+	__m128 r12 = or12;
+	const __m128 vP = load_m128(P);
+	const __m128 dif = _mm_sub_ps(vP, p1);
+	const __m128 dif_second = _mm_sub_ps(vP, p2);
+	if(difl != 0.0f) {
+		const __m128 len1_sq = len3_squared_splat(dif);
+		const __m128 len2_sq = len3_squared_splat(dif_second);
+		const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
+		r12 = _mm_max_ps(or12, pixelsize12);
+	}
+	float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
+	float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
+
+	const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
+	const __m128 p21_diff = _mm_sub_ps(p2, p1);
+	const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
+	const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
+#endif
+
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;
+	float sp_r = mr + 0.5f * l;
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)
+		return false;
+
+	/* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
+#endif
+	float gd = (r2 - r1) * invl;
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* test minimum separation */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const __m128 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* calculate true intersection */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;
+
+	if (td < 0.0f)
+		return false;
+
+	float rootd = 0.0f;
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+			backface = true;
+			correction = ((-tb + rootd)/(2*a));
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* stochastic fade from minimum width */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if (flags & CURVE_KN_ENCLOSEFILTER) {
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most primitives are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection; a backface hit is flagged below by negating u */
+				isect->prim = curveAddr;
+				isect->segment = segment;
+				isect->object = object;
+				isect->u = z*invl;
+				isect->v = td/(4*a*a);
+				/*isect->v = 1.0f - adjradius;*/
+				isect->t = t;
+
+				if(backface) 
+					isect->u = -isect->u;
+				
+				return true;
+			}
+		}
+	}
+
+	return false;
+
+#ifndef __KERNEL_SSE2__
+#undef len3_squared
+#undef len3
+#undef dot3
+#endif
+}
+#endif
+
+#ifdef __SUBSURFACE__
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point. */
+/* Intersect one triangle: every hit with 0 < t < tmax bumps *num_hits, but at most max_hits of them are kept in isect_array. */
+ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
+	float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
+{
+	/* compute and check intersection t-value */
+	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
+	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
+	float3 dir = 1.0f/idir; /* the parameter is the inverse direction, undo that here */
+
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
+	float t = Oz * invDz;
+
+	if(t > 0.0f && t < tmax) {
+		/* compute and check barycentric u */
+		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
+		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
+		float u = Ox + t*Dx;
+
+		if(u >= 0.0f) {
+			/* compute and check barycentric v (third row is only fetched once u passed) */
+			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
+			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
+			float v = Oy + t*Dy;
+
+			if(v >= 0.0f && u + v <= 1.0f) {
+				(*num_hits)++; /* counted even if the hit is discarded below */
+
+				int hit;
+
+				if(*num_hits <= max_hits) {
+					hit = *num_hits - 1; /* array not full yet: append */
+				}
+				else {
+					/* reservoir sampling: if we are at the maximum number of
+					 * hits, randomly replace element or skip it */
+					hit = lcg_step_uint(lcg_state) % *num_hits;
+
+					if(hit >= max_hits) /* drawn slot lies outside the array: skip this hit */
+						return;
+				}
+
+				/* record intersection */
+				Intersection *isect = &isect_array[hit];
+				isect->prim = triAddr;
+				isect->object = object;
+				isect->u = u;
+				isect->v = v;
+				isect->t = t;
+			}
+		}
+	}
+}
+#endif
+
+/* BVH intersection function variations: each #include below instantiates a
+ * template as BVH_FUNCTION_NAME, specialised by the BVH_FUNCTION_FEATURES bits */
+#define BVH_INSTANCING			1
+#define BVH_MOTION				2
+#define BVH_HAIR				4
+#define BVH_HAIR_MINIMUM_WIDTH	8
+
+#define BVH_FUNCTION_NAME bvh_intersect
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_traversal.h"
+
+#if defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_traversal.h"
+#endif
+/* subsurface variants: geom_bvh_subsurface.h #undefs both macros at its end, hence the redefinition before every include */
+#if defined(__SUBSURFACE__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#include "geom_bvh_subsurface.h"
+#endif
+/* Scene intersection entry point: forwards to the bvh_intersect_* variant matching the scene's BVH flags and returns its result. */
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+#ifdef __HAIR__ /* hair builds take three extra parameters that only the hair variants receive */
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+#else
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+#endif
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_motion(kg, ray, isect, visibility);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__ /* no motion: curves alone select the hair variant */
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+	/* CPU picks instancing at runtime from have_instancing; other devices use the instancing variant whenever it is compiled in */
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_instancing(kg, ray, isect, visibility);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect(kg, ray, isect, visibility);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_instancing(kg, ray, isect, visibility);
+#else
+	return bvh_intersect(kg, ray, isect, visibility);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+/* Subsurface counterpart of scene_intersect(): forwards to a bvh_intersect_subsurface_* variant and returns the uint it yields. */
+/* to work around titan bug when using arrays instead of textures */
+#ifdef __SUBSURFACE__
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__ /* no motion: curves alone select the hair variant */
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __HAIR__ */
+	/* CPU picks instancing at runtime from have_instancing; other devices use the instancing variant whenever it is compiled in */
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#else
+	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+#endif
+
+/* Ray offset to avoid self intersection: returns P pushed along Ng, handled
+ * one component at a time when __INTERSECTION_REFINE__ is enabled */
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
+{
+#ifdef __INTERSECTION_REFINE__
+	const float epsilon_f = 1e-5f; /* scale of the float offset used for small components */
+	/* ideally this should match epsilon_f, but instancing/mblur
+	 * precision makes it problematic */
+	const float epsilon_test = 1.0f;
+	const int epsilon_i = 32; /* step applied to the float's integer bit pattern */
+
+	float3 res;
+	/* per component: |P| below epsilon_test gets a float offset; otherwise the bit pattern is decreased when the sign bits of P and Ng differ, else increased */
+	/* x component */
+	if(fabsf(P.x) < epsilon_test) {
+		res.x = P.x + Ng.x*epsilon_f;
+	}
+	else {
+		uint ix = __float_as_uint(P.x);
+		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
+		res.x = __uint_as_float(ix);
+	}
+
+	/* y component */
+	if(fabsf(P.y) < epsilon_test) {
+		res.y = P.y + Ng.y*epsilon_f;
+	}
+	else {
+		uint iy = __float_as_uint(P.y);
+		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
+		res.y = __uint_as_float(iy);
+	}
+
+	/* z component */
+	if(fabsf(P.z) < epsilon_test) {
+		res.z = P.z + Ng.z*epsilon_f;
+	}
+	else {
+		uint iz = __float_as_uint(P.z);
+		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
+		res.z = __uint_as_float(iz);
+	}
+
+	return res;
+#else
+	const float epsilon_f = 1e-4f; /* without refinement a single, larger float offset is used */
+	return P + epsilon_f*Ng;
+#endif
+}
+
+/* Refine triangle intersection to more precise hit point. For rays that travel
+ * far the precision is often not so good, this reintersects the primitive from
+ * a closer distance. Returns the refined position; when isect->object != ~0 the
+ * work is done after the inverse object transform and transformed back at the end. */
+ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != ~0) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t); /* D is scaled by t before the transform */
+		D = normalize_len(D, &t); /* presumably returns the unit vector and writes its length to t */
+	}
+
+	P = P + D*t; /* approximate hit point, used as the new ray origin */
+
+	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
+	float rt = Oz * invDz;
+	/* rt uses the same t formula as the triangle intersection test, now evaluated from the approximate hit point */
+	P = P + D*rt;
+
+	if(isect->object != ~0) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t; /* refinement disabled: plain ray evaluation */
+#endif
+}
+
+/* same as bvh_triangle_refine, except that isect->t is assumed to be in object space for instancing */
+ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != ~0) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D); /* only the direction is transformed; t is used unchanged */
+		D = normalize(D);
+	}
+
+	P = P + D*t; /* approximate hit point, used as the new ray origin */
+
+	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
+	float rt = Oz * invDz;
+	/* rt uses the same t formula as the triangle intersection test, now evaluated from the approximate hit point */
+	P = P + D*rt;
+
+	if(isect->object != ~0) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t; /* refinement disabled: plain ray evaluation */
+#endif
+}
+
+#ifdef __HAIR__
+/* Derivative with respect to t of the curvepoint() blend: tangent of the p1..p2 span at parameter t (not normalized). */
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* fc has to stay equal to the constant used by curvepoint() */
+	const float fc = 0.71f;
+	const float t2 = t * t;
+	const float w0 = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
+	const float w1 =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
+	const float w2 =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
+	const float w3 =  3.0f * fc          * t2  - 2.0f * fc * t;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+/* Point on the cubic span that runs from p1 (t = 0) to p2 (t = 1); p0 and p3 are the two outer control points. */
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* fc has to stay equal to the constant used by curvetangent() */
+	const float fc = 0.71f;
+	const float t2 = t * t;
+	const float t3 = t2 * t;
+	const float w0 = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
+	const float w1 =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
+	const float w2 =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
+	const float w3 =  fc          * t3  - fc * t2;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+/* Curve hit setup: returns the hit position and fills sd->Ng and sd->N (plus sd->u/v under __UV__ and sd->dPdu/dPdv under __DPDU__). */
+ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+
+	if(isect->object != ~0) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t); /* D is scaled by t before the transform */
+		D = normalize_len(D, &t); /* presumably returns the unit vector and writes its length to t */
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim); /* presumably x = first key index, y = key count; verify against the __curves layout */
+
+	int k0 = __float_as_int(v00.x) + isect->segment;
+	int k1 = k0 + 1;
+
+	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
+	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
+	float l = 1.0f;
+	float3 tg = normalize_len(float4_to_float3(P2 - P1), &l);
+	float r1 = P1.w;
+	float r2 = P2.w;
+	float gd = ((r2 - r1)/l); /* change of r per unit length along the segment */
+	
+	P = P + D*t;
+
+	if(flag & CURVE_KN_INTERPOLATE) {
+		int ka = max(k0 - 1,__float_as_int(v00.x)); /* neighbour keys, clamped to this curve's key range */
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
+		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
+
+		float3 p[4];
+		p[0] = float4_to_float3(P0);
+		p[1] = float4_to_float3(P1);
+		p[2] = float4_to_float3(P2);
+		p[3] = float4_to_float3(P3);
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif
+	
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+		else {
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
+			sd->Ng = normalize(P - p_curr);
+			sd->Ng = sd->Ng - gd * tg;
+			sd->Ng = normalize(sd->Ng);
+		}
+		sd->N = sd->Ng;
+	}
+	else {
+		float3 dif = P - float4_to_float3(P1);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
+#endif
+
+		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else { /* NOTE(review): sd->u is read below but only assigned above under __UV__ - confirm __UV__ is always defined */
+			sd->Ng = (dif - tg * sd->u * l) / (P1.w + sd->u * l * gd);
+			if (gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg ;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu/dPdv */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif
+
+	/*add fading parameter for minimum pixel width with transparency bsdf*/
+	/*sd->curve_transparency = isect->v;*/
+	/*sd->curve_radius = sd->u * gd * l + r1;*/
+
+	if(isect->object != ~0) { /* only P is transformed back here; Ng, N, dPdu and dPdv are left as computed above */
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
new file mode 100644
index 00000000000..40683a2da57
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -0,0 +1,294 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ * BVH_HAIR: primitives whose __prim_segment entry is not ~0 are skipped
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+/* Gathers hits of ray with triangles of subsurface_object into isect_array (filled by bvh_triangle_intersect_subsurface) and returns the hit count. */
+ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
+	int subsurface_object, uint *lcg_state, int max_hits)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - SSE for hair
+	 * - test restrict attribute for pointers
+	 */
+	
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 idir = bvh_inverse_direction(ray->D);
+	int object = ~0; /* ~0 while traversing the top level, set to the instance's object after an instance push */
+	float isect_t = tmax;
+
+	const uint visibility = ~0; /* all bits set, so any nonzero node visibility passes the tests below */
+	uint num_hits = 0;
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
+	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = _mm_set_ps1(P.x);
+	Psplat[1] = _mm_set_ps1(P.y);
+	Psplat[2] = _mm_set_ps1(P.z);
+
+	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do
+		{
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
+			{
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+
+				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					union { __m128 m128; float v[4]; } uminmax;
+					uminmax.m128 = tminmax;
+					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) { /* a negative primAddr marks an instance leaf, handled in the else branch below */
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					for(; primAddr < primAddr2; primAddr++) {
+#if FEATURE(BVH_HAIR)
+						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
+						if(segment != ~0) /* presumably a curve segment: not used for subsurface hits */
+							continue;
+#endif
+
+						/* only primitives from the same object */
+						uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
+
+						if(tri_object == subsurface_object) {
+
+							/* intersect ray against primitive */
+							bvh_triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+						}
+					}
+				}
+#if FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push, only entered for the instance of subsurface_object */
+					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
+						object = subsurface_object;
+
+#if FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax);
+#else
+						bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = _mm_set_ps1(P.x);
+						Psplat[1] = _mm_set_ps1(P.y);
+						Psplat[2] = _mm_set_ps1(P.z);
+
+						tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+						++stackPtr;
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; /* makes the inner loop exit once the instance subtree is done */
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) { /* sentinel came from an instance push, not from the bottom of the stack */
+			kernel_assert(object != ~0);
+
+			/* instance pop */
+#if FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = _mm_set_ps1(P.x);
+			Psplat[1] = _mm_set_ps1(P.y);
+			Psplat[2] = _mm_set_ps1(P.z);
+
+			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = ~0;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits; /* counter passed to bvh_triangle_intersect_subsurface as &num_hits */
+}
+/* undefine the template parameters so the next #include of this file can redefine them */
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
new file mode 100644
index 00000000000..0515a9e0fa7
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -0,0 +1,354 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+ccl_device bool BVH_FUNCTION_NAME
+(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility
+#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+, uint *lcg_state, float difl, float extmax
+#endif
+)
+{
+	/* Traverse the BVH from kernel_data.bvh.root along `ray`, recording the hit
+	 * in `isect`. Returns (isect->prim != ~0), or true right away on the first
+	 * hit when visibility == PATH_RAY_SHADOW_OPAQUE.
+	 *
+	 * todo: test if pushing distance on the stack helps (for non shadow rays),
+	 * separate version for shadow rays, likely and unlikely for if() statements,
+	 * SSE for hair, test restrict attribute for pointers */
+	
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 idir = bvh_inverse_direction(ray->D);
+	int object = ~0; /* ~0 outside an instance; set on instance push, reset on pop */
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = tmax;
+	isect->object = ~0;
+	isect->prim = ~0;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
+	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = _mm_set_ps1(P.x);
+	Psplat[1] = _mm_set_ps1(P.y);
+	Psplat[2] = _mm_set_ps1(P.z);
+
+	__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); /* lanes {0, 0, -t, -t} */
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do
+		{
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
+			{
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					float hdiff = 1.0f + difl;
+					float ldiff = 1.0f - difl;
+					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+						c0min = max(ldiff * c0min, c0min - extmax);
+						c0max = min(hdiff * c0max, c0max + extmax);
+					}
+					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+						c1min = max(ldiff * c1min, c1min - extmax);
+						c1max = min(hdiff * c1max, c1max + extmax);
+					}
+				}
+#endif
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
+				__m128 tminmax = _mm_xor_ps(minmax, pn); /* not const: the hair code below writes its lanes */
+
+#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					float4 *tminmaxview = (float4*)&tminmax;
+					float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
+					float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
+
+					float hdiff = 1.0f + difl;
+					float ldiff = 1.0f - difl;
+					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
+						c0min = max(ldiff * c0min, c0min - extmax);
+						c0max = min(hdiff * c0max, c0max + extmax);
+					}
+					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
+						c1min = max(ldiff * c1min, c1min - extmax);
+						c1max = min(hdiff * c1max, c1max + extmax);
+					}
+				}
+#endif
+
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					union { __m128 m128; float v[4]; } uminmax;
+					uminmax.m128 = tminmax;
+					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) { /* primitives [primAddr, primAddr2); negative means instance */
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					while(primAddr < primAddr2) {
+						bool hit;
+
+						/* intersect ray against primitive */
+#if FEATURE(BVH_HAIR)
+						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
+						if(segment != ~0) {
+
+							if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+							else
+								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
+#else
+								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+							else
+								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+#endif
+						}
+						else
+#endif
+							hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
+
+						/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+						if(hit) {
+							if(visibility == PATH_RAY_SHADOW_OPAQUE)
+								return true;
+
+							tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+						}
+#else
+						if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
+							return true;
+#endif
+
+						primAddr++;
+					}
+				}
+#if FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
+#else
+					bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+					Psplat[0] = _mm_set_ps1(P.x);
+					Psplat[1] = _mm_set_ps1(P.y);
+					Psplat[2] = _mm_set_ps1(P.z);
+
+					tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+					++stackPtr;
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+				}
+			}
+#endif
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) { /* sentinel hit with entries left: it was pushed by an instance push */
+			kernel_assert(object != ~0);
+
+			/* instance pop */
+#if FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = _mm_set_ps1(P.x);
+			Psplat[1] = _mm_set_ps1(P.y);
+			Psplat[2] = _mm_set_ps1(P.z);
+
+			tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = ~0;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != ~0);
+}
+
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
new file mode 100644
index 00000000000..821ac50eaa9
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __HAIR__
+
+/* curve attributes */
+
+ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	if(elem == ATTR_ELEMENT_CURVE) {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = 0.0f; }
+		if(dy) { *dy = 0.0f; }
+#endif
+		/* one value per curve, indexed by the primitive */
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+	}
+	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
+		const float4 curve = kernel_tex_fetch(__curves, sd->prim);
+		const int key_a = __float_as_int(curve.x) + sd->segment;
+		const int key_b = key_a + 1;
+		/* attribute values at the two consecutive keys key_a and key_b */
+		const float val_a = kernel_tex_fetch(__attributes_float, offset + key_a);
+		const float val_b = kernel_tex_fetch(__attributes_float, offset + key_b);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*(val_b - val_a); }
+		if(dy) { *dy = 0.0f; }
+#endif
+		/* linear blend between the two keys by sd->u */
+		return (1.0f - sd->u)*val_a + sd->u*val_b;
+	}
+	else {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = 0.0f; }
+		if(dy) { *dy = 0.0f; }
+#endif
+		/* any other element type yields zero */
+		return 0.0f;
+	}
+}
+
+ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	if(elem == ATTR_ELEMENT_CURVE) {
+		/* idea: no useful differentials can be derived here, but for tiled
+		 * mipmap image caching it would help to avoid always reading the
+		 * highest detail level. maybe a derivative based on the hair density
+		 * could be computed somehow? */
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = make_float3(0.0f, 0.0f, 0.0f); }
+		if(dy) { *dy = make_float3(0.0f, 0.0f, 0.0f); }
+#endif
+		/* one value per curve, indexed by the primitive */
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+	}
+	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
+		const float4 curve = kernel_tex_fetch(__curves, sd->prim);
+		const int key_a = __float_as_int(curve.x) + sd->segment;
+		const int key_b = key_a + 1;
+		/* attribute values at the two consecutive keys key_a and key_b */
+		const float3 val_a = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + key_a));
+		const float3 val_b = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + key_b));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*(val_b - val_a); }
+		if(dy) { *dy = make_float3(0.0f, 0.0f, 0.0f); }
+#endif
+		/* linear blend between the two keys by sd->u */
+		return (1.0f - sd->u)*val_a + sd->u*val_b;
+	}
+	else {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = make_float3(0.0f, 0.0f, 0.0f); }
+		if(dy) { *dy = make_float3(0.0f, 0.0f, 0.0f); }
+#endif
+		/* any other element type yields zero */
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+/* hair info node functions */
+
+ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+{
+	/* sd->segment == ~0 means there is no curve segment: thickness is zero */
+	if(sd->segment == ~0)
+		return 0.0f;
+	float4 curve = kernel_tex_fetch(__curves, sd->prim);
+	int key_a = __float_as_int(curve.x) + sd->segment;
+	int key_b = key_a + 1;
+
+	float4 key_lo = kernel_tex_fetch(__curve_keys, key_a);
+	float4 key_hi = kernel_tex_fetch(__curve_keys, key_b);
+	float radius = (key_hi.w - key_lo.w) * sd->u + key_lo.w;
+
+	/* the keys' w component is blended by sd->u and doubled */
+	return radius*2.0f;
+}
+
+ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	/* zero vector when sd->segment == ~0 (no curve segment) */
+	if(sd->segment == ~0)
+		return make_float3(0.0f,0.0f,0.0f);
+
+	/* -I with its component along dPdu removed, then negated and normalized */
+	float3 perp = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+	perp = normalize(perp);
+
+	/* need to find suitable scaled gd for corrected normal */
+#if 0
+	perp = normalize(perp - gd * sd->dPdu);
+#endif
+
+	return perp;
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
new file mode 100644
index 00000000000..a66277e10cd
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+enum ObjectTransform { /* float4 row offsets added to object*OBJECT_SIZE when indexing __objects */
+	OBJECT_TRANSFORM = 0, /* first of the 3 rows read by object_fetch_transform() */
+	OBJECT_TRANSFORM_MOTION_PRE = 0, /* base of the 8 rows read by object_fetch_transform_motion() */
+	OBJECT_INVERSE_TRANSFORM = 4, /* first of the 3 rows of the stored inverse */
+	OBJECT_TRANSFORM_MOTION_POST = 4, /* NOTE(review): not referenced in this file - confirm its use */
+	OBJECT_PROPERTIES = 8, /* x: surface area, y: pass id, z: random number, w: particle id bits */
+	OBJECT_DUPLI = 9 /* xyz: dupli generated coordinate; the following row's xy: dupli uv */
+};
+
+enum ObjectVectorTransform { /* float4 row offsets added to object*OBJECT_VECTOR_SIZE for __objects_vector */
+	OBJECT_VECTOR_MOTION_PRE = 0, /* first of the 3 rows read by object_fetch_vector_transform() */
+	OBJECT_VECTOR_MOTION_POST = 3 /* starts right after the 3 rows of the PRE transform */
+};
+
+ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
+{
+	/* three consecutive float4 rows, starting at the slot selected by `type` */
+	const int first_row = object*OBJECT_SIZE + (int)type;
+	Transform result;
+	result.x = kernel_tex_fetch(__objects, first_row + 0);
+	result.y = kernel_tex_fetch(__objects, first_row + 1);
+	result.z = kernel_tex_fetch(__objects, first_row + 2);
+	/* the fourth row is not fetched, it is always (0, 0, 0, 1) */
+	result.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+	return result;
+}
+
+ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
+{
+	/* three consecutive float4 rows of __objects_vector, selected by `type` */
+	const int first_row = object*OBJECT_VECTOR_SIZE + (int)type;
+	Transform result;
+	result.x = kernel_tex_fetch(__objects_vector, first_row + 0);
+	result.y = kernel_tex_fetch(__objects_vector, first_row + 1);
+	result.z = kernel_tex_fetch(__objects_vector, first_row + 2);
+	/* the fourth row is not fetched, it is always (0, 0, 0, 1) */
+	result.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+	return result;
+}
+
+#ifdef __OBJECT_MOTION__
+ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
+{
+	const int base = object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE;
+	DecompMotionTransform decomp;
+
+	/* rows 0-3: the `mid` part of the decomposed motion transform */
+	decomp.mid.x = kernel_tex_fetch(__objects, base + 0);
+	decomp.mid.y = kernel_tex_fetch(__objects, base + 1);
+	decomp.mid.z = kernel_tex_fetch(__objects, base + 2);
+	decomp.mid.w = kernel_tex_fetch(__objects, base + 3);
+	/* rows 4-7: the pre_x, pre_y, post_x and post_y parts */
+	decomp.pre_x = kernel_tex_fetch(__objects, base + 4);
+	decomp.pre_y = kernel_tex_fetch(__objects, base + 5);
+	decomp.post_x = kernel_tex_fetch(__objects, base + 6);
+	decomp.post_y = kernel_tex_fetch(__objects, base + 7);
+
+	/* evaluate the decomposed motion at `time` */
+	Transform result;
+	transform_motion_interpolate(&result, &decomp, time);
+	return result;
+}
+
+ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm)
+{
+	const int flags = kernel_tex_fetch(__object_flag, object);
+	Transform forward;
+
+	if(flags & SD_OBJECT_MOTION) {
+		/* motion blur: interpolate at `time`, compute the inverse on request */
+		forward = object_fetch_transform_motion(kg, object, time);
+		if(itfm) {
+			*itfm = transform_quick_inverse(forward);
+		}
+	}
+	else {
+		/* no motion: transform and inverse are both fetched from __objects */
+		forward = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+		if(itfm) {
+			*itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+		}
+	}
+	return forward;
+}
+#endif
+
+ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+{
+#ifndef __OBJECT_MOTION__
+	Transform fwd = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); /* fetched per call */
+	*P = transform_point(&fwd, *P);
+#else
+	*P = transform_point(&sd->ob_tfm, *P); /* transform stored in the shader data */
+#endif
+}
+
+ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+{
+#ifndef __OBJECT_MOTION__
+	Transform inv = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); /* fetched per call */
+	*P = transform_point(&inv, *P);
+#else
+	*P = transform_point(&sd->ob_itfm, *P); /* inverse stored in the shader data */
+#endif
+}
+
+ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+{
+#ifndef __OBJECT_MOTION__
+	Transform fwd = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); /* forward matrix, applied transposed */
+	*N = normalize(transform_direction_transposed(&fwd, *N));
+#else
+	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N)); /* forward matrix, applied transposed */
+#endif
+}
+
+ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+{
+#ifndef __OBJECT_MOTION__
+	Transform inv = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); /* inverse matrix, applied transposed */
+	*N = normalize(transform_direction_transposed(&inv, *N));
+#else
+	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N)); /* inverse matrix, applied transposed */
+#endif
+}
+
+ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+{
+#ifndef __OBJECT_MOTION__
+	Transform fwd = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); /* fetched per call */
+	*D = transform_direction(&fwd, *D);
+#else
+	*D = transform_direction(&sd->ob_tfm, *D); /* transform stored in the shader data */
+#endif
+}
+
+ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+{
+#ifndef __OBJECT_MOTION__
+	Transform inv = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); /* fetched per call */
+	*D = transform_direction(&inv, *D);
+#else
+	*D = transform_direction(&sd->ob_itfm, *D); /* inverse stored in the shader data */
+#endif
+}
+
+ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
+{
+	/* w components of the transform rows; the origin when sd->object == ~0 */
+	if(sd->object == ~0) { return make_float3(0.0f, 0.0f, 0.0f); }
+
+#ifndef __OBJECT_MOTION__
+	Transform fwd = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	return make_float3(fwd.x.w, fwd.y.w, fwd.z.w);
+#else
+	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+#endif
+}
+
+ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
+{
+	const int row = object*OBJECT_SIZE + OBJECT_PROPERTIES;
+	const float4 props = kernel_tex_fetch(__objects, row);
+	return props.x; /* x component of the properties row */
+}
+
+ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
+{
+	/* object == ~0 yields 0 */
+	if(object == ~0) { return 0.0f; }
+
+	const int row = object*OBJECT_SIZE + OBJECT_PROPERTIES;
+	const float4 props = kernel_tex_fetch(__objects, row);
+	return props.y; /* y component of the properties row */
+}
+
+ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
+{
+	/* object == ~0 yields 0 */
+	if(object == ~0) { return 0.0f; }
+
+	const int row = object*OBJECT_SIZE + OBJECT_PROPERTIES;
+	const float4 props = kernel_tex_fetch(__objects, row);
+	return props.z; /* z component of the properties row */
+}
+
+ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object)
+{
+	if(object == ~0)
+		return 0; /* integer zero: the return type is uint, not float */
+
+	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
+	float4 f = kernel_tex_fetch(__objects, offset);
+	return __float_as_uint(f.w); /* w holds the id's raw bits, reinterpreted rather than converted */
+}
+
+ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+{
+	/* object == ~0 yields the zero vector */
+	if(object == ~0) { return make_float3(0.0f, 0.0f, 0.0f); }
+
+	const int row = object*OBJECT_SIZE + OBJECT_DUPLI;
+	const float4 dupli = kernel_tex_fetch(__objects, row);
+	return make_float3(dupli.x, dupli.y, dupli.z);
+}
+
+ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+{
+	/* object == ~0 yields the zero vector */
+	if(object == ~0) { return make_float3(0.0f, 0.0f, 0.0f); }
+
+	const int row = object*OBJECT_SIZE + OBJECT_DUPLI;
+	const float4 dupli = kernel_tex_fetch(__objects, row + 1); /* the row after OBJECT_DUPLI */
+	return make_float3(dupli.x, dupli.y, 0.0f);
+}
+
+
+ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd)
+{ /* reads entry (shader index)*2 + 1 of __shader_flag, the index being sd->shader masked with SHADER_MASK */
+	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+}
+
+ccl_device_inline float particle_index(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row0 = kernel_tex_fetch(__particles, base + 0);
+	return row0.x; /* row 0, x component */
+}
+
+ccl_device float particle_age(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row0 = kernel_tex_fetch(__particles, base + 0);
+	return row0.y; /* row 0, y component */
+}
+
+ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row0 = kernel_tex_fetch(__particles, base + 0);
+	return row0.z; /* row 0, z component */
+}
+
+ccl_device float particle_size(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row0 = kernel_tex_fetch(__particles, base + 0);
+	return row0.w; /* row 0, w component */
+}
+
+ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row1 = kernel_tex_fetch(__particles, base + 1);
+	return row1; /* all four components of row 1 */
+}
+
+ccl_device float3 particle_location(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row2 = kernel_tex_fetch(__particles, base + 2);
+	return make_float3(row2.x, row2.y, row2.z); /* row 2, xyz */
+}
+
+ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row2 = kernel_tex_fetch(__particles, base + 2);
+	const float4 row3 = kernel_tex_fetch(__particles, base + 3);
+	return make_float3(row2.w, row3.x, row3.y); /* straddles rows 2 and 3 */
+}
+
+ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+{
+	const int base = particle*PARTICLE_SIZE;
+	const float4 row3 = kernel_tex_fetch(__particles, base + 3);
+	const float4 row4 = kernel_tex_fetch(__particles, base + 4);
+	return make_float3(row3.z, row3.w, row4.x); /* straddles rows 3 and 4 */
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
new file mode 100644
index 00000000000..0455df85961
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Point on triangle for Moller-Trumbore triangles */
+ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
+{
+	/* the three vertex indices are stored as int bits inside floats */
+	float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	float3 corner_a = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.x)));
+	float3 corner_b = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.y)));
+	float3 corner_c = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.z)));
+
+	/* weights are u, v and the remainder 1 - u - v */
+	float w = 1.0f - u - v;
+	return (u*corner_a + v*corner_b + w*corner_c);
+}
+
+/* Normal for Moller-Trumbore triangles */
+ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
+{
+#if 1
+	/* stored per triangle: xyz is returned as the normal, w's bits go to *shader */
+	float4 packed = kernel_tex_fetch(__tri_normal, tri_index);
+	*shader = __float_as_int(packed.w);
+	return make_float3(packed.x, packed.y, packed.z);
+#else
+	/* disabled alternative: derive the normal from the vertex positions
+	 * (this path does not write *shader) */
+	float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+	float3 corner_a = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.x)));
+	float3 corner_b = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.y)));
+	float3 corner_c = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.z)));
+
+	return normalize(cross(corner_c - corner_a, corner_b - corner_a));
+#endif
+}
+
+/* Return 3 triangle vertex locations */
+ccl_device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
+{
+	/* the three vertex indices are stored as int bits inside floats */
+	float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.x)));
+	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.y)));
+	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.z)));
+}
+
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
+{
+	/* the three vertex indices are stored as int bits inside floats */
+	float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	float3 normal_a = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(vidx.x)));
+	float3 normal_b = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(vidx.y)));
+	float3 normal_c = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(vidx.z)));
+
+	return normalize((1.0f - u - v)*normal_c + u*normal_a + v*normal_b);
+}
+
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
+{
+	/* the three vertex indices are stored as int bits inside floats */
+	float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri));
+
+	float3 corner_a = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.x)));
+	float3 corner_b = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.y)));
+	float3 corner_c = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vidx.z)));
+
+	/* both derivatives are edge vectors taken relative to the third vertex */
+	*dPdu = (corner_a - corner_c);
+	*dPdv = (corner_b - corner_c);
+}
+
+/* attributes */
+
+ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) { *dx = 0.0f; }
+		if(dy) { *dy = 0.0f; }
+		/* one value per triangle, indexed by the primitive */
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX) {
+		float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		/* one value per vertex, looked up through the triangle's vertex indices */
+		float val_a = kernel_tex_fetch(__attributes_float, offset + __float_as_int(vidx.x));
+		float val_b = kernel_tex_fetch(__attributes_float, offset + __float_as_int(vidx.y));
+		float val_c = kernel_tex_fetch(__attributes_float, offset + __float_as_int(vidx.z));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*val_a + sd->dv.dx*val_b - (sd->du.dx + sd->dv.dx)*val_c; }
+		if(dy) { *dy = sd->du.dy*val_a + sd->dv.dy*val_b - (sd->du.dy + sd->dv.dy)*val_c; }
+#endif
+		/* weights are u, v and the remainder 1 - u - v */
+		return sd->u*val_a + sd->v*val_b + (1.0f - sd->u - sd->v)*val_c;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int first = offset + sd->prim*3;
+		float val_a = kernel_tex_fetch(__attributes_float, first + 0);
+		float val_b = kernel_tex_fetch(__attributes_float, first + 1);
+		float val_c = kernel_tex_fetch(__attributes_float, first + 2);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*val_a + sd->dv.dx*val_b - (sd->du.dx + sd->dv.dx)*val_c; }
+		if(dy) { *dy = sd->du.dy*val_a + sd->dv.dy*val_b - (sd->du.dy + sd->dv.dy)*val_c; }
+#endif
+		/* three values per triangle, same weighting as the vertex case */
+		return sd->u*val_a + sd->v*val_b + (1.0f - sd->u - sd->v)*val_c;
+	}
+	else {
+		if(dx) { *dx = 0.0f; }
+		if(dy) { *dy = 0.0f; }
+		/* any other element type yields zero */
+		return 0.0f;
+	}
+}
+
+ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) { *dx = make_float3(0.0f, 0.0f, 0.0f); }
+		if(dy) { *dy = make_float3(0.0f, 0.0f, 0.0f); }
+		/* one value per triangle, indexed by the primitive */
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX) {
+		float3 vidx = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		/* one value per vertex, looked up through the triangle's vertex indices */
+		float3 val_a = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(vidx.x)));
+		float3 val_b = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(vidx.y)));
+		float3 val_c = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(vidx.z)));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*val_a + sd->dv.dx*val_b - (sd->du.dx + sd->dv.dx)*val_c; }
+		if(dy) { *dy = sd->du.dy*val_a + sd->dv.dy*val_b - (sd->du.dy + sd->dv.dy)*val_c; }
+#endif
+		/* weights are u, v and the remainder 1 - u - v */
+		return sd->u*val_a + sd->v*val_b + (1.0f - sd->u - sd->v)*val_c;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int first = offset + sd->prim*3;
+		float3 val_a = float4_to_float3(kernel_tex_fetch(__attributes_float3, first + 0));
+		float3 val_b = float4_to_float3(kernel_tex_fetch(__attributes_float3, first + 1));
+		float3 val_c = float4_to_float3(kernel_tex_fetch(__attributes_float3, first + 2));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) { *dx = sd->du.dx*val_a + sd->dv.dx*val_b - (sd->du.dx + sd->dv.dx)*val_c; }
+		if(dy) { *dy = sd->du.dy*val_a + sd->dv.dy*val_b - (sd->du.dy + sd->dv.dy)*val_c; }
+#endif
+		/* three values per triangle, same weighting as the vertex case */
+		return sd->u*val_a + sd->v*val_b + (1.0f - sd->u - sd->v)*val_c;
+	}
+	else {
+		if(dx) { *dx = make_float3(0.0f, 0.0f, 0.0f); }
+		if(dy) { *dy = make_float3(0.0f, 0.0f, 0.0f); }
+		/* any other element type yields zero */
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
deleted file mode 100644
index 942c7abce65..00000000000
--- a/intern/cycles/kernel/kernel_bvh.h
+++ /dev/null
@@ -1,1318 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * "Persistent while-while kernel" used in:
- *
- * "Understanding the Efficiency of Ray Traversal on GPUs",
- * Timo Aila and Samuli Laine,
- * Proc. High-Performance Graphics 2009
- */
-
-/* bottom-most stack entry, indicating the end of traversal */
-#define ENTRYPOINT_SENTINEL 0x76543210
-
-/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
-#define BVH_STACK_SIZE 192
-#define BVH_NODE_SIZE 4
-#define TRI_NODE_SIZE 3
-
-/* silly workaround for float extended precision that happens when compiling
- * without sse support on x86, it results in different results for float ops
- * that you would otherwise expect to compare correctly */
-#if !defined(__i386__) || defined(__SSE__)
-#define NO_EXTENDED_PRECISION
-#else
-#define NO_EXTENDED_PRECISION volatile
-#endif
-
-ccl_device_inline float3 bvh_inverse_direction(float3 dir)
-{
-	/* avoid divide by zero (ooeps = exp2f(-80.0f)) */
-	float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
-	float3 idir;
-
-	idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
-	idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
-	idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
-
-	return idir;
-}
-
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
-{
-	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
-	*P = transform_point(&tfm, ray->P);
-
-	float3 dir = transform_direction(&tfm, ray->D);
-
-	float len;
-	dir = normalize_len(dir, &len);
-
-	*idir = bvh_inverse_direction(dir);
-
-	if(*t != FLT_MAX)
-		*t *= len;
-}
-
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
-{
-	if(*t != FLT_MAX) {
-		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-		*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
-	}
-
-	*P = ray->P;
-	*idir = bvh_inverse_direction(ray->D);
-}
-
-#ifdef __OBJECT_MOTION__
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
-{
-	Transform itfm;
-	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
-
-	*P = transform_point(&itfm, ray->P);
-
-	float3 dir = transform_direction(&itfm, ray->D);
-
-	float len;
-	dir = normalize_len(dir, &len);
-
-	*idir = bvh_inverse_direction(dir);
-
-	if(*t != FLT_MAX)
-		*t *= len;
-}
-
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
-{
-	if(*t != FLT_MAX)
-		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
-
-	*P = ray->P;
-	*idir = bvh_inverse_direction(ray->D);
-}
-#endif
-
-/* Sven Woop's algorithm */
-ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int triAddr)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-	float3 dir = 1.0f/idir;
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < isect->t) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-#ifdef __VISIBILITY_FLAG__
-				/* visibility flag test. we do it here under the assumption
-				 * that most triangles are culled by node flags */
-				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-				{
-					/* record intersection */
-					isect->prim = triAddr;
-					isect->object = object;
-					isect->u = u;
-					isect->v = v;
-					isect->t = t;
-					return true;
-				}
-			}
-		}
-	}
-
-	return false;
-}
-
-#ifdef __HAIR__
-ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
-{
-	float halfdiscroot = (p2 * p2 - 3 * p3 * p1);
-	float ta = -1.0f;
-	float tb = -1.0f;
-	*extremta = -1.0f;
-	*extremtb = -1.0f;
-	*upper = p0;
-	*lower = p0 + p1 + p2 + p3;
-	*extrema = *upper;
-	*extremb = *lower;
-	if(*lower >= *upper) {
-		*upper = *lower;
-		*lower = p0;
-	}
-
-	if(halfdiscroot >= 0) {
-		halfdiscroot = sqrt(halfdiscroot);
-		ta = (-p2 - halfdiscroot) / (3 * p3);
-		tb = (-p2 + halfdiscroot) / (3 * p3);
-	}
-
-	float t2;
-	float t3;
-	if(ta > 0.0f && ta < 1.0f) {
-		t2 = ta * ta;
-		t3 = t2 * ta;
-		*extremta = ta;
-		*extrema = p3 * t3 + p2 * t2 + p1 * ta + p0;
-		if(*extrema > *upper) {
-			*upper = *extrema;
-		}
-		if(*extrema < *lower) {
-			*lower = *extrema;
-		}
-	}
-	if(tb > 0.0f && tb < 1.0f) {
-		t2 = tb * tb;
-		t3 = t2 * tb;
-		*extremtb = tb;
-		*extremb = p3 * t3 + p2 * t2 + p1 * tb + p0;
-		if(*extremb >= *upper) {
-			*upper = *extremb;
-		}
-		if(*extremb <= *lower) {
-			*lower = *extremb;
-		}
-	}
-}
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
-{
-	return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2])));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and idir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-#endif
-{
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	__m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
-	__m128 vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
-	{
-		__m128 dtmp = _mm_mul_ps(vdir, vdir);
-		__m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
-		__m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
-
-		__m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-		__m128 P0 = _mm_load_ps(&kg->__curve_keys.data[ka].x);
-		__m128 P1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
-		__m128 P2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
-		__m128 P3 = _mm_load_ps(&kg->__curve_keys.data[kb].x);
-
-		__m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
-		__m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
-		__m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
-		__m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		__m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
-
-		__m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		__m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
-		__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-		__m128 htfm[] = { htfm0, htfm1, htfm2 };
-		__m128 vP = load_m128(P);
-		__m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, vP));
-		__m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, vP));
-		__m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, vP));
-		__m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, vP));
-
-		float fc = 0.71f;
-		__m128 vfc = _mm_set1_ps(fc);
-		__m128 vfcxp3 = _mm_mul_ps(vfc, p3);
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
-		vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
-
-		r_st = ((float4 &)P1).w;
-		r_en = ((float4 &)P2).w;
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	float3 dir = 1.0f/idir;
-
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
-		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P0) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P1) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P2) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P3) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P1.w;
-		r_en = P2.w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		float i_st = tree * resol;
-		float i_en = i_st + (level * resol);
-#ifdef __KERNEL_SSE2__
-		__m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
-		__m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		__m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		__m128 vbmin = _mm_min_ps(vp_st, vp_en);
-		__m128 vbmax = _mm_max_ps(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if (level == 1) {
-
-			/* the maximum recursion depth is reached.
-			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			* dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-				float w = tg.x * tg.x + tg.y * tg.y;
-				if (w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = clamp((float)w, 0.0f, 1.0f);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					if (d0 >= 0)
-						coverage = (min(d1 / mw_extension, 1.0f) - min(d0 / mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 / mw_extension, 1.0f) + min(-d0 / mw_extension, 1.0f)) * 0.5f;
-				}
-				
-				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float3 tg = (p_en - p_st) / l;
-				float gd = (or2 - or1) / l;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb/cyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if (td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				
-				float rootd = sqrtf(td);
-				float correction = ((-tb - rootd)/(2*cyla));
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = ((-tb + rootd)/(2*cyla));
-					t = tcentre + correction;
-				}			
-
-				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction))/l;
-				w = clamp((float)w, 0.0f, 1.0f);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r1 + (r2 - r1) * w;
-				r_ext = or1 + (or2 - or1) * w;
-				coverage = r_curr/r_ext;
-
-			}
-			/* we found a new intersection */
-
-			/* stochastic fade from minimum width */
-			if(lcg_state && coverage != 1.0f) {
-				if(lcg_step_float(lcg_state) > coverage)
-					return hit;
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->prim = curveAddr;
-				isect->segment = segment;
-				isect->object = object;
-				isect->u = u;
-				isect->v = 0.0f;
-				/*isect->v = 1.0f - coverage; */
-				isect->t = t;
-				hit = true;
-			}
-			
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-{
-	/* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-#define len3_squared(x) len_squared(x)
-#define len3(x) len(x)
-#define dot3(x, y) dot(x, y)
-#endif
-
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
-	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-
-	float or1 = P1.w;
-	float or2 = P2.w;
-	float3 p1 = float4_to_float3(P1);
-	float3 p2 = float4_to_float3(P2);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	float3 dif = P - p1;
-	float3 dif_second = P - p2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len3(dif) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len3(dif_second) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float3 dir = 1.0f / idir;
-	float3 p21_diff = p2 - p1;
-	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-	float sphere_b_tmp = dot3(dir, sphere_dif1);
-	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
-	const __m128 p1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
-	const __m128 p2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
-	const __m128 or12 = shuffle<3, 3, 3, 3>(p1, p2);
-
-	__m128 r12 = or12;
-	const __m128 vP = load_m128(P);
-	const __m128 dif = _mm_sub_ps(vP, p1);
-	const __m128 dif_second = _mm_sub_ps(vP, p2);
-	if(difl != 0.0f) {
-		const __m128 len1_sq = len3_squared_splat(dif);
-		const __m128 len2_sq = len3_squared_splat(dif_second);
-		const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
-		r12 = _mm_max_ps(or12, pixelsize12);
-	}
-	float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
-	float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
-
-	const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
-	const __m128 p21_diff = _mm_sub_ps(p2, p1);
-	const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
-	const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
-	float mr = max(r1, r2);
-	float l = len3(p21_diff);
-	float invl = 1.0f / l;
-	float sp_r = mr + 0.5f * l;
-
-	float sphere_b = dot3(dir, sphere_dif2);
-	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
-	float3 tg = p21_diff * invl;
-#else
-	const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
-#endif
-	float gd = (r2 - r1) * invl;
-
-	float dirz = dot3(dir, tg);
-	float difz = dot3(dif, tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
-	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-#ifndef __KERNEL_SSE2__
-	float3 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross(tg, dif));
-#else
-	const __m128 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
-	float cprodsq = len3_squared(cprod);
-	float distscaled = dot3(cprod, dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-#ifndef __KERNEL_SSE2__
-	float3 tdif = dif + tcentre * dir;
-#else
-	const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
-#endif
-	float tdifz = dot3(tdif, tg);
-	float tdifma = tdifz*gd + r1;
-	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
-	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
-	float td = tb*tb - 4*a*tc;
-
-	if (td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) * invl;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if (flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->prim = curveAddr;
-				isect->segment = segment;
-				isect->object = object;
-				isect->u = z*invl;
-				isect->v = td/(4*a*a);
-				/*isect->v = 1.0f - adjradius;*/
-				isect->t = t;
-
-				if(backface) 
-					isect->u = -isect->u;
-				
-				return true;
-			}
-		}
-	}
-
-	return false;
-
-#ifndef __KERNEL_SSE2__
-#undef len3_squared
-#undef len3
-#undef dot3
-#endif
-}
-#endif
-
-#ifdef __SUBSURFACE__
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
-	float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-	float3 dir = 1.0f/idir;
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < tmax) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-				(*num_hits)++;
-
-				int hit;
-
-				if(*num_hits <= max_hits) {
-					hit = *num_hits - 1;
-				}
-				else {
-					/* reservoir sampling: if we are at the maximum number of
-					 * hits, randomly replace element or skip it */
-					hit = lcg_step_uint(lcg_state) % *num_hits;
-
-					if(hit >= max_hits)
-						return;
-				}
-
-				/* record intersection */
-				Intersection *isect = &isect_array[hit];
-				isect->prim = triAddr;
-				isect->object = object;
-				isect->u = u;
-				isect->v = v;
-				isect->t = t;
-			}
-		}
-	}
-}
-#endif
-
-/* BVH intersection function variations */
-
-#define BVH_INSTANCING			1
-#define BVH_MOTION				2
-#define BVH_HAIR				4
-#define BVH_HAIR_MINIMUM_WIDTH	8
-
-#define BVH_FUNCTION_NAME bvh_intersect
-#define BVH_FUNCTION_FEATURES 0
-#include "kernel_bvh_traversal.h"
-
-#if defined(__INSTANCING__)
-#define BVH_FUNCTION_NAME bvh_intersect_instancing
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__)
-#define BVH_FUNCTION_NAME bvh_intersect_hair
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__SUBSURFACE__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface
-#define BVH_FUNCTION_FEATURES 0
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#include "kernel_bvh_subsurface.h"
-#endif
-
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-#ifdef __HAIR__ 
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
-#else
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
-#endif
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-		return bvh_intersect_motion(kg, ray, isect, visibility);
-	}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__ 
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-#ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_instancing(kg, ray, isect, visibility);
-#endif /* __INSTANCING__ */
-
-	return bvh_intersect(kg, ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
-
-#ifdef __INSTANCING__
-	return bvh_intersect_instancing(kg, ray, isect, visibility);
-#else
-	return bvh_intersect(kg, ray, isect, visibility);
-#endif /* __INSTANCING__ */
-
-#endif /* __KERNEL_CPU__ */
-}
-
-/* to work around titan bug when using arrays instead of textures */
-#ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __HAIR__ */
-
-		return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-	}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__ 
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-#ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __INSTANCING__ */
-
-	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#else /* __KERNEL_CPU__ */
-
-#ifdef __INSTANCING__
-	return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#else
-	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __INSTANCING__ */
-
-#endif /* __KERNEL_CPU__ */
-}
-#endif
-
-/* Ray offset to avoid self intersection */
-
-ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
-{
-#ifdef __INTERSECTION_REFINE__
-	const float epsilon_f = 1e-5f;
-	/* ideally this should match epsilon_f, but instancing/mblur
-	 * precision makes it problematic */
-	const float epsilon_test = 1.0f;
-	const int epsilon_i = 32;
-
-	float3 res;
-
-	/* x component */
-	if(fabsf(P.x) < epsilon_test) {
-		res.x = P.x + Ng.x*epsilon_f;
-	}
-	else {
-		uint ix = __float_as_uint(P.x);
-		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
-		res.x = __uint_as_float(ix);
-	}
-
-	/* y component */
-	if(fabsf(P.y) < epsilon_test) {
-		res.y = P.y + Ng.y*epsilon_f;
-	}
-	else {
-		uint iy = __float_as_uint(P.y);
-		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
-		res.y = __uint_as_float(iy);
-	}
-
-	/* z component */
-	if(fabsf(P.z) < epsilon_test) {
-		res.z = P.z + Ng.z*epsilon_f;
-	}
-	else {
-		uint iz = __float_as_uint(P.z);
-		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
-		res.z = __uint_as_float(iz);
-	}
-
-	return res;
-#else
-	const float epsilon_f = 1e-4f;
-	return P + epsilon_f*Ng;
-#endif
-}
-
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* same as above, except that isect->t is assumed to be in object space for instancing */
-ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-#ifdef __HAIR__
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + isect->segment;
-	int k1 = k0 + 1;
-
-	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-	float l = 1.0f;
-	float3 tg = normalize_len(float4_to_float3(P2 - P1), &l);
-	float r1 = P1.w;
-	float r2 = P2.w;
-	float gd = ((r2 - r1)/l);
-	
-	P = P + D*t;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
-		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
-
-		float3 p[4];
-		p[0] = float4_to_float3(P0);
-		p[1] = float4_to_float3(P1);
-		p[2] = float4_to_float3(P2);
-		p[3] = float4_to_float3(P3);
-
-#ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
-#endif
-	
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
-		else {
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
-			sd->Ng = sd->Ng - gd * tg;
-			sd->Ng = normalize(sd->Ng);
-		}
-		sd->N = sd->Ng;
-	}
-	else {
-		float3 dif = P - float4_to_float3(P1);
-
-#ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
-#endif
-
-		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
-		}
-		else {
-			sd->Ng = (dif - tg * sd->u * l) / (P1.w + sd->u * l * gd);
-			if (gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg ;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		sd->N = sd->Ng;
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
-#endif
-
-	/*add fading parameter for minimum pixel width with transparency bsdf*/
-	/*sd->curve_transparency = isect->v;*/
-	/*sd->curve_radius = sd->u * gd * l + r1;*/
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-#endif
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
deleted file mode 100644
index 40683a2da57..00000000000
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2013, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
-	int subsurface_object, uint *lcg_state, int max_hits)
-{
-	/* todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - SSE for hair
-	 * - test restrict attribute for pointers
-	 */
-	
-	/* traversal stack in CUDA thread-local memory */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 idir = bvh_inverse_direction(ray->D);
-	int object = ~0;
-	float isect_t = tmax;
-
-	const uint visibility = ~0;
-	uint num_hits = 0;
-
-#if FEATURE(BVH_MOTION)
-	Transform ob_tfm;
-#endif
-
-#if defined(__KERNEL_SSE2__)
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
-
-	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	/* traversal loop */
-	do {
-		do
-		{
-			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
-			{
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
-
-#if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect_t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
-				/* intersect ray against child nodes */
-				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-#endif
-
-#else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
-
-				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
-
-				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-#else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
-#endif
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
-
-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
-#endif
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* one child was intersected */
-					if(traverseChild1) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf, fetch triangle list */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-
-					/* pop */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection */
-					for(; primAddr < primAddr2; primAddr++) {
-#if FEATURE(BVH_HAIR)
-						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
-						if(segment != ~0)
-							continue;
-#endif
-
-						/* only primitives from the same object */
-						uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
-
-						if(tri_object == subsurface_object) {
-
-							/* intersect ray against primitive */
-							bvh_triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
-						}
-					}
-				}
-#if FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push */
-					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
-						object = subsurface_object;
-
-#if FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax);
-#else
-						bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax);
-#endif
-
-#if defined(__KERNEL_SSE2__)
-						Psplat[0] = _mm_set_ps1(P.x);
-						Psplat[1] = _mm_set_ps1(P.y);
-						Psplat[2] = _mm_set_ps1(P.z);
-
-						tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
-
-						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-						++stackPtr;
-						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-						nodeAddr = kernel_tex_fetch(__object_node, object);
-					}
-					else {
-						/* pop */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-#endif
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != ~0);
-
-			/* instance pop */
-#if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax);
-#else
-			bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax);
-#endif
-
-#if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
-
-			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
-
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-			object = ~0;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return num_hits;
-}
-
-#undef FEATURE
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
deleted file mode 100644
index 0515a9e0fa7..00000000000
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2013, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
- * BVH_MOTION: motion blur rendering
- *
- */
-
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device bool BVH_FUNCTION_NAME
-(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-, uint *lcg_state, float difl, float extmax
-#endif
-)
-{
-	/* todo:
-	 * - test if pushing distance on the stack helps (for non shadow rays)
-	 * - separate version for shadow rays
-	 * - likely and unlikely for if() statements
-	 * - SSE for hair
-	 * - test restrict attribute for pointers
-	 */
-	
-	/* traversal stack in CUDA thread-local memory */
-	int traversalStack[BVH_STACK_SIZE];
-	traversalStack[0] = ENTRYPOINT_SENTINEL;
-
-	/* traversal variables in registers */
-	int stackPtr = 0;
-	int nodeAddr = kernel_data.bvh.root;
-
-	/* ray parameters in registers */
-	const float tmax = ray->t;
-	float3 P = ray->P;
-	float3 idir = bvh_inverse_direction(ray->D);
-	int object = ~0;
-
-#if FEATURE(BVH_MOTION)
-	Transform ob_tfm;
-#endif
-
-	isect->t = tmax;
-	isect->object = ~0;
-	isect->prim = ~0;
-	isect->u = 0.0f;
-	isect->v = 0.0f;
-
-#if defined(__KERNEL_SSE2__)
-	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
-	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
-	shuffle_swap_t shufflexyz[3];
-
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
-
-	__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
-
-	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-	/* traversal loop */
-	do {
-		do
-		{
-			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
-			{
-				bool traverseChild0, traverseChild1;
-				int nodeAddrChild1;
-
-#if !defined(__KERNEL_SSE2__)
-				/* Intersect two child bounding boxes, non-SSE version */
-				float t = isect->t;
-
-				/* fetch node data */
-				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
-				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
-				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
-				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
-
-				/* intersect ray against child nodes */
-				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
-
-				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
-				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
-				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
-				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
-
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-				if(difl != 0.0f) {
-					float hdiff = 1.0f + difl;
-					float ldiff = 1.0f - difl;
-					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-						c0min = max(ldiff * c0min, c0min - extmax);
-						c0max = min(hdiff * c0max, c0max + extmax);
-					}
-					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-						c1min = max(ldiff * c1min, c1min - extmax);
-						c1max = min(hdiff * c1max, c1max + extmax);
-					}
-				}
-#endif
-
-				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
-				traverseChild0 = (c0max >= c0min);
-				traverseChild1 = (c1max >= c1min);
-#endif
-
-#else // __KERNEL_SSE2__
-				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
-
-				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
-				const float4 cnodes = ((float4*)bvh_nodes)[3];
-
-				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
-
-				/* calculate { c0min, c1min, -c0max, -c1max} */
-				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-				const __m128 tminmax = _mm_xor_ps(minmax, pn);
-
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-				if(difl != 0.0f) {
-					float4 *tminmaxview = (float4*)&tminmax;
-					float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
-					float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
-
-					float hdiff = 1.0f + difl;
-					float ldiff = 1.0f - difl;
-					if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
-						c0min = max(ldiff * c0min, c0min - extmax);
-						c0max = min(hdiff * c0max, c0max + extmax);
-					}
-					if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
-						c1min = max(ldiff * c1min, c1min - extmax);
-						c1max = min(hdiff * c1max, c1max + extmax);
-					}
-				}
-#endif
-
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
-
-				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-#else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
-#endif
-#endif // __KERNEL_SSE2__
-
-				nodeAddr = __float_as_int(cnodes.x);
-				nodeAddrChild1 = __float_as_int(cnodes.y);
-
-				if(traverseChild0 && traverseChild1) {
-					/* both children were intersected, push the farther one */
-#if !defined(__KERNEL_SSE2__)
-					bool closestChild1 = (c1min < c0min);
-#else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
-#endif
-
-					if(closestChild1) {
-						int tmp = nodeAddr;
-						nodeAddr = nodeAddrChild1;
-						nodeAddrChild1 = tmp;
-					}
-
-					++stackPtr;
-					traversalStack[stackPtr] = nodeAddrChild1;
-				}
-				else {
-					/* one child was intersected */
-					if(traverseChild1) {
-						nodeAddr = nodeAddrChild1;
-					}
-					else if(!traverseChild0) {
-						/* neither child was intersected */
-						nodeAddr = traversalStack[stackPtr];
-						--stackPtr;
-					}
-				}
-			}
-
-			/* if node is leaf, fetch triangle list */
-			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
-				int primAddr = __float_as_int(leaf.x);
-
-#if FEATURE(BVH_INSTANCING)
-				if(primAddr >= 0) {
-#endif
-					int primAddr2 = __float_as_int(leaf.y);
-
-					/* pop */
-					nodeAddr = traversalStack[stackPtr];
-					--stackPtr;
-
-					/* primitive intersection */
-					while(primAddr < primAddr2) {
-						bool hit;
-
-						/* intersect ray against primitive */
-#if FEATURE(BVH_HAIR)
-						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
-						if(segment != ~0) {
-
-							if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
-							else
-								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
-#else
-								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
-							else
-								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
-#endif
-						}
-						else
-#endif
-							hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
-
-						/* shadow ray early termination */
-#if defined(__KERNEL_SSE2__)
-						if(hit) {
-							if(visibility == PATH_RAY_SHADOW_OPAQUE)
-								return true;
-
-							tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
-						}
-#else
-						if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
-							return true;
-#endif
-
-						primAddr++;
-					}
-				}
-#if FEATURE(BVH_INSTANCING)
-				else {
-					/* instance push */
-					object = kernel_tex_fetch(__prim_object, -primAddr-1);
-
-#if FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
-#else
-					bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
-#endif
-
-#if defined(__KERNEL_SSE2__)
-					Psplat[0] = _mm_set_ps1(P.x);
-					Psplat[1] = _mm_set_ps1(P.y);
-					Psplat[2] = _mm_set_ps1(P.z);
-
-					tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
-
-					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-					++stackPtr;
-					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
-
-					nodeAddr = kernel_tex_fetch(__object_node, object);
-				}
-			}
-#endif
-		} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-#if FEATURE(BVH_INSTANCING)
-		if(stackPtr >= 0) {
-			kernel_assert(object != ~0);
-
-			/* instance pop */
-#if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
-#else
-			bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
-#endif
-
-#if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
-
-			tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
-
-			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
-			object = ~0;
-			nodeAddr = traversalStack[stackPtr];
-			--stackPtr;
-		}
-#endif
-	} while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-	return (isect->prim != ~0);
-}
-
-#undef FEATURE
-#undef BVH_FUNCTION_NAME
-#undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/kernel_curve.h b/intern/cycles/kernel/kernel_curve.h
deleted file mode 100644
index 821ac50eaa9..00000000000
--- a/intern/cycles/kernel/kernel_curve.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __HAIR__
-
-/* curve attributes */
-
-ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
-{
-	if(elem == ATTR_ELEMENT_CURVE) {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-#endif
-
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
-	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
-		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
-		if(dy) *dy = 0.0f;
-#endif
-
-		return (1.0f - sd->u)*f0 + sd->u*f1;
-	}
-	else {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-#endif
-
-		return 0.0f;
-	}
-}
-
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
-{
-	if(elem == ATTR_ELEMENT_CURVE) {
-		/* idea: we can't derive any useful differentials here, but for tiled
-		 * mipmap image caching it would be useful to avoid reading the highest
-		 * detail level always. maybe a derivative based on the hair density
-		 * could be computed somehow? */
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
-	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return (1.0f - sd->u)*f0 + sd->u*f1;
-	}
-	else {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
-/* hair info node functions */
-
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
-{
-	float r = 0.0f;
-
-	if(sd->segment != ~0) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-		r = (P2.w - P1.w) * sd->u + P1.w;
-	}
-
-	return r*2.0f;
-}
-
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
-	float3 tgN = make_float3(0.0f,0.0f,0.0f);
-
-	if(sd->segment != ~0) {
-
-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
-		tgN = normalize(tgN);
-
-		/* need to find suitable scaled gd for corrected normal */
-#if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
-#endif
-	}
-
-	return tgN;
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_object.h b/intern/cycles/kernel/kernel_object.h
deleted file mode 100644
index a66277e10cd..00000000000
--- a/intern/cycles/kernel/kernel_object.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-enum ObjectTransform {
-	OBJECT_TRANSFORM = 0,
-	OBJECT_TRANSFORM_MOTION_PRE = 0,
-	OBJECT_INVERSE_TRANSFORM = 4,
-	OBJECT_TRANSFORM_MOTION_POST = 4,
-	OBJECT_PROPERTIES = 8,
-	OBJECT_DUPLI = 9
-};
-
-enum ObjectVectorTransform {
-	OBJECT_VECTOR_MOTION_PRE = 0,
-	OBJECT_VECTOR_MOTION_POST = 3
-};
-
-ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
-{
-	int offset = object*OBJECT_SIZE + (int)type;
-
-	Transform tfm;
-	tfm.x = kernel_tex_fetch(__objects, offset + 0);
-	tfm.y = kernel_tex_fetch(__objects, offset + 1);
-	tfm.z = kernel_tex_fetch(__objects, offset + 2);
-	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-	return tfm;
-}
-
-ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
-{
-	int offset = object*OBJECT_VECTOR_SIZE + (int)type;
-
-	Transform tfm;
-	tfm.x = kernel_tex_fetch(__objects_vector, offset + 0);
-	tfm.y = kernel_tex_fetch(__objects_vector, offset + 1);
-	tfm.z = kernel_tex_fetch(__objects_vector, offset + 2);
-	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-
-	return tfm;
-}
-
-#ifdef __OBJECT_MOTION__
-ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
-{
-	DecompMotionTransform motion;
-
-	int offset = object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE;
-
-	motion.mid.x = kernel_tex_fetch(__objects, offset + 0);
-	motion.mid.y = kernel_tex_fetch(__objects, offset + 1);
-	motion.mid.z = kernel_tex_fetch(__objects, offset + 2);
-	motion.mid.w = kernel_tex_fetch(__objects, offset + 3);
-
-	motion.pre_x = kernel_tex_fetch(__objects, offset + 4);
-	motion.pre_y = kernel_tex_fetch(__objects, offset + 5);
-	motion.post_x = kernel_tex_fetch(__objects, offset + 6);
-	motion.post_y = kernel_tex_fetch(__objects, offset + 7);
-
-	Transform tfm;
-	transform_motion_interpolate(&tfm, &motion, time);
-
-	return tfm;
-}
-
-ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm)
-{
-	int object_flag = kernel_tex_fetch(__object_flag, object);
-
-	if(object_flag & SD_OBJECT_MOTION) {
-		/* if we do motion blur */
-		Transform tfm = object_fetch_transform_motion(kg, object, time);
-
-		if(itfm)
-			*itfm = transform_quick_inverse(tfm);
-
-		return tfm;
-	}
-	else {
-		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-		if(itfm)
-			*itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
-		return tfm;
-	}
-}
-#endif
-
-ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
-{
-#ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_tfm, *P);
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-	*P = transform_point(&tfm, *P);
-#endif
-}
-
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
-{
-#ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_itfm, *P);
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
-	*P = transform_point(&tfm, *P);
-#endif
-}
-
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
-{
-#ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-	*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
-}
-
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
-{
-#ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
-	*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
-}
-
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
-{
-#ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_tfm, *D);
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-	*D = transform_direction(&tfm, *D);
-#endif
-}
-
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
-{
-#ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_itfm, *D);
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
-	*D = transform_direction(&tfm, *D);
-#endif
-}
-
-ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
-{
-	if(sd->object == ~0)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
-#ifdef __OBJECT_MOTION__
-	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
-#else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-#endif
-}
-
-ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
-{
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.x;
-}
-
-ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
-{
-	if(object == ~0)
-		return 0.0f;
-
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.y;
-}
-
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
-{
-	if(object == ~0)
-		return 0.0f;
-
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return f.z;
-}
-
-ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object)
-{
-	if(object == ~0)
-		return 0.0f;
-
-	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return __float_as_uint(f.w);
-}
-
-ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
-{
-	if(object == ~0)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
-	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
-	float4 f = kernel_tex_fetch(__objects, offset);
-	return make_float3(f.x, f.y, f.z);
-}
-
-ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
-{
-	if(object == ~0)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
-	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
-	float4 f = kernel_tex_fetch(__objects, offset + 1);
-	return make_float3(f.x, f.y, 0.0f);
-}
-
-
-ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd)
-{
-	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
-}
-
-ccl_device_inline float particle_index(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.x;
-}
-
-ccl_device float particle_age(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.y;
-}
-
-ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.z;
-}
-
-ccl_device float particle_size(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 0);
-	return f.w;
-}
-
-ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 1);
-	return f;
-}
-
-ccl_device float3 particle_location(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f = kernel_tex_fetch(__particles, offset + 2);
-	return make_float3(f.x, f.y, f.z);
-}
-
-ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f2 = kernel_tex_fetch(__particles, offset + 2);
-	float4 f3 = kernel_tex_fetch(__particles, offset + 3);
-	return make_float3(f2.w, f3.x, f3.y);
-}
-
-ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
-{
-	int offset = particle*PARTICLE_SIZE;
-	float4 f3 = kernel_tex_fetch(__particles, offset + 3);
-	float4 f4 = kernel_tex_fetch(__particles, offset + 4);
-	return make_float3(f3.z, f3.w, f4.x);
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 81b61a54a6a..9b3ddbb7557 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -18,16 +18,15 @@
 #include "osl_shader.h"
 #endif
 
+#include "kernel_random.h"
+
+#include "geom/geom_bvh.h"
+
 #include "kernel_differential.h"
 #include "kernel_montecarlo.h"
 #include "kernel_projection.h"
-#include "kernel_object.h"
-#include "kernel_triangle.h"
-#include "kernel_curve.h"
 #include "kernel_primitive.h"
 #include "kernel_projection.h"
-#include "kernel_random.h"
-#include "kernel_bvh.h"
 #include "kernel_accumulate.h"
 #include "kernel_camera.h"
 #include "kernel_shader.h"
diff --git a/intern/cycles/kernel/kernel_triangle.h b/intern/cycles/kernel/kernel_triangle.h
deleted file mode 100644
index 0455df85961..00000000000
--- a/intern/cycles/kernel/kernel_triangle.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Point on triangle for Moller-Trumbore triangles */
-ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute point */
-	float t = 1.0f - u - v;
-	return (u*v0 + v*v1 + t*v2);
-}
-
-/* Normal for Moller-Trumbore triangles */
-ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
-{
-#if 0
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute normal */
-	return normalize(cross(v2 - v0, v1 - v0));
-#else
-	float4 Nm = kernel_tex_fetch(__tri_normal, tri_index);
-	*shader = __float_as_int(Nm.w);
-	return make_float3(Nm.x, Nm.y, Nm.z);
-#endif
-}
-
-/* Return 3 triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-}
-
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
-	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
-	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z)));
-
-	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
-}
-
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
-{
-	/* fetch triangle vertex coordinates */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri));
-
-	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute derivatives of P w.r.t. uv */
-	*dPdu = (p0 - p2);
-	*dPdv = (p1 - p2);
-}
-
-/* attributes */
-
-ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
-{
-	if(elem == ATTR_ELEMENT_FACE) {
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
-	}
-	else if(elem == ATTR_ELEMENT_VERTEX) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
-
-		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
-		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
-		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
-		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
-		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
-		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else {
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-
-		return 0.0f;
-	}
-}
-
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
-{
-	if(elem == ATTR_ELEMENT_FACE) {
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
-	}
-	else if(elem == ATTR_ELEMENT_VERTEX) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
-
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else {
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 58858c3766e..a25d2fe03b5 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -30,14 +30,13 @@
 
 #include "kernel_compat_cpu.h"
 #include "kernel_globals.h"
+#include "kernel_random.h"
+
+#include "geom/geom_bvh.h"
+
 #include "kernel_montecarlo.h"
 #include "kernel_projection.h"
 #include "kernel_differential.h"
-#include "kernel_object.h"
-#include "kernel_random.h"
-#include "kernel_bvh.h"
-#include "kernel_triangle.h"
-#include "kernel_curve.h"
 #include "kernel_primitive.h"
 #include "kernel_projection.h"
 #include "kernel_accumulate.h"
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 554f647df7c..34d9ebefdb3 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -18,7 +18,8 @@
 #include "kernel_montecarlo.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
-#include "kernel_object.h"
+
+#include "geom/geom_object.h"
 
 #include "closure/bsdf_diffuse.h"
 #include "closure/bssrdf.h"
-- 
cgit v1.2.3