diff options
Diffstat (limited to 'intern/cycles')
62 files changed, 4721 insertions, 2231 deletions
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 94ed3dd4311..3c9c83fec42 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -383,7 +383,6 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.prop(cscene, "use_progressive_refine") subsub = sub.column(align=True) - subsub.enabled = not rd.use_border subsub.prop(rd, "use_save_buffers") col = split.column(align=True) @@ -1599,89 +1598,62 @@ def draw_pause(self, context): def get_panels(): - types = bpy.types - panels = [ - "RENDER_PT_render", - "RENDER_PT_output", - "RENDER_PT_encoding", - "RENDER_PT_dimensions", - "RENDER_PT_stamp", - "RENDER_PT_freestyle", - "RENDERLAYER_PT_layers", - "RENDERLAYER_PT_freestyle", - "RENDERLAYER_PT_freestyle_lineset", - "RENDERLAYER_PT_freestyle_linestyle", - "SCENE_PT_scene", - "SCENE_PT_color_management", - "SCENE_PT_custom_props", - "SCENE_PT_audio", - "SCENE_PT_unit", - "SCENE_PT_keying_sets", - "SCENE_PT_keying_set_paths", - "SCENE_PT_physics", - "WORLD_PT_context_world", - "WORLD_PT_custom_props", - "DATA_PT_context_mesh", - "DATA_PT_context_camera", - "DATA_PT_context_lamp", - "DATA_PT_context_speaker", - "DATA_PT_normals", - "DATA_PT_texture_space", - "DATA_PT_curve_texture_space", - "DATA_PT_mball_texture_space", - "DATA_PT_vertex_groups", - "DATA_PT_shape_keys", - "DATA_PT_uv_texture", - "DATA_PT_vertex_colors", - "DATA_PT_camera", - "DATA_PT_camera_display", - "DATA_PT_camera_stereoscopy", - "DATA_PT_camera_safe_areas", - "DATA_PT_lens", - "DATA_PT_speaker", - "DATA_PT_distance", - "DATA_PT_cone", - "DATA_PT_customdata", - "DATA_PT_custom_props_mesh", - "DATA_PT_custom_props_camera", - "DATA_PT_custom_props_lamp", - "DATA_PT_custom_props_speaker", - "DATA_PT_custom_props_arm", - "DATA_PT_custom_props_curve", - "DATA_PT_custom_props_lattice", - "DATA_PT_custom_props_metaball", - "TEXTURE_PT_preview", - "TEXTURE_PT_custom_props", - "TEXTURE_PT_clouds", - "TEXTURE_PT_wood", - 
"TEXTURE_PT_marble", - "TEXTURE_PT_magic", - "TEXTURE_PT_blend", - "TEXTURE_PT_stucci", - "TEXTURE_PT_image", - "TEXTURE_PT_image_sampling", - "TEXTURE_PT_image_mapping", - "TEXTURE_PT_musgrave", - "TEXTURE_PT_voronoi", - "TEXTURE_PT_distortednoise", - "TEXTURE_PT_voxeldata", - "TEXTURE_PT_pointdensity", - "TEXTURE_PT_pointdensity_turbulence", - "TEXTURE_PT_mapping", - "TEXTURE_PT_ocean", - "TEXTURE_PT_influence", - "TEXTURE_PT_colors", - "SCENE_PT_rigid_body_world", - "SCENE_PT_rigid_body_cache", - "SCENE_PT_rigid_body_field_weights", - "MATERIAL_PT_custom_props", - "MATERIAL_PT_freestyle_line", - "BONE_PT_custom_props", - "OBJECT_PT_custom_props", - ] - - return [getattr(types, p) for p in panels if hasattr(types, p)] - + exclude_panels = { + 'DATA_PT_area', + 'DATA_PT_camera_dof', + 'DATA_PT_falloff_curve', + 'DATA_PT_lamp', + 'DATA_PT_preview', + 'DATA_PT_shadow', + 'DATA_PT_spot', + 'DATA_PT_sunsky', + 'MATERIAL_PT_context_material', + 'MATERIAL_PT_diffuse', + 'MATERIAL_PT_flare', + 'MATERIAL_PT_halo', + 'MATERIAL_PT_mirror', + 'MATERIAL_PT_options', + 'MATERIAL_PT_pipeline', + 'MATERIAL_PT_preview', + 'MATERIAL_PT_shading', + 'MATERIAL_PT_shadow', + 'MATERIAL_PT_specular', + 'MATERIAL_PT_sss', + 'MATERIAL_PT_strand', + 'MATERIAL_PT_transp', + 'MATERIAL_PT_volume_density', + 'MATERIAL_PT_volume_integration', + 'MATERIAL_PT_volume_lighting', + 'MATERIAL_PT_volume_options', + 'MATERIAL_PT_volume_shading', + 'MATERIAL_PT_volume_transp', + 'RENDERLAYER_PT_layer_options', + 'RENDERLAYER_PT_layer_passes', + 'RENDERLAYER_PT_views', + 'RENDER_PT_antialiasing', + 'RENDER_PT_bake', + 'RENDER_PT_motion_blur', + 'RENDER_PT_performance', + 'RENDER_PT_post_processing', + 'RENDER_PT_shading', + 'SCENE_PT_simplify', + 'TEXTURE_PT_context_texture', + 'WORLD_PT_ambient_occlusion', + 'WORLD_PT_environment_lighting', + 'WORLD_PT_gather', + 'WORLD_PT_indirect_lighting', + 'WORLD_PT_mist', + 'WORLD_PT_preview', + 'WORLD_PT_world' + } + + panels = [] + for panel in 
bpy.types.Panel.__subclasses__(): + if hasattr(panel, 'COMPAT_ENGINES') and 'BLENDER_RENDER' in panel.COMPAT_ENGINES: + if panel.__name__ not in exclude_panels: + panels.append(panel) + + return panels def register(): bpy.types.RENDER_PT_render.append(draw_device) @@ -1690,10 +1662,10 @@ def register(): for panel in get_panels(): panel.COMPAT_ENGINES.add('CYCLES') - def unregister(): bpy.types.RENDER_PT_render.remove(draw_device) bpy.types.VIEW3D_HT_header.remove(draw_pause) for panel in get_panels(): - panel.COMPAT_ENGINES.remove('CYCLES') + if 'CYCLES' in panel.COMPAT_ENGINES: + panel.COMPAT_ENGINES.remove('CYCLES') diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 7ca23f23cb4..64559804ccb 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -440,7 +440,7 @@ static ShaderNode *add_node(Scene *scene, glossy->distribution = CLOSURE_BSDF_MICROFACET_GGX_ID; break; case BL::ShaderNodeBsdfGlossy::distribution_ASHIKHMIN_SHIRLEY: - glossy->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID; + glossy->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID; break; case BL::ShaderNodeBsdfGlossy::distribution_MULTI_GGX: glossy->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 5729fa6113d..92e48f0d87f 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -19,6 +19,7 @@ set(SRC bvh_node.cpp bvh_sort.cpp bvh_split.cpp + bvh_unaligned.cpp ) set(SRC_HEADERS @@ -29,6 +30,7 @@ set(SRC_HEADERS bvh_params.h bvh_sort.h bvh_split.h + bvh_unaligned.h ) include_directories(${INC}) diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index fa2b9ae7279..e92526ac1c4 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -24,6 +24,7 @@ #include "bvh_build.h" #include "bvh_node.h" #include "bvh_params.h" +#include "bvh_unaligned.h" 
#include "util_debug.h" #include "util_foreach.h" @@ -121,7 +122,7 @@ void BVH::refit(Progress& progress) /* Triangles */ -void BVH::pack_triangle(int idx, float4 storage[3]) +void BVH::pack_triangle(int idx, float4 tri_verts[3]) { int tob = pack.prim_object[idx]; assert(tob >= 0 && tob < objects.size()); @@ -129,49 +130,58 @@ void BVH::pack_triangle(int idx, float4 storage[3]) int tidx = pack.prim_index[idx]; Mesh::Triangle t = mesh->get_triangle(tidx); - const float3* vpos = &mesh->verts[0]; + const float3 *vpos = &mesh->verts[0]; float3 v0 = vpos[t.v[0]]; float3 v1 = vpos[t.v[1]]; float3 v2 = vpos[t.v[2]]; - storage[0] = float3_to_float4(v0); - storage[1] = float3_to_float4(v1); - storage[2] = float3_to_float4(v2); + tri_verts[0] = float3_to_float4(v0); + tri_verts[1] = float3_to_float4(v1); + tri_verts[2] = float3_to_float4(v2); } void BVH::pack_primitives() { - int nsize = TRI_NODE_SIZE; - size_t tidx_size = pack.prim_index.size(); - - pack.tri_storage.clear(); - pack.tri_storage.resize(tidx_size * nsize); + const size_t tidx_size = pack.prim_index.size(); + size_t num_prim_triangles = 0; + /* Count number of triangles primitives in BVH. */ + for(unsigned int i = 0; i < tidx_size; i++) { + if((pack.prim_index[i] != -1)) { + if ((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + ++num_prim_triangles; + } + } + } + /* Reserve size for arrays. */ + pack.prim_tri_index.clear(); + pack.prim_tri_index.resize(tidx_size); + pack.prim_tri_verts.clear(); + pack.prim_tri_verts.resize(num_prim_triangles * 3); pack.prim_visibility.clear(); pack.prim_visibility.resize(tidx_size); - + /* Fill in all the arrays. 
*/ + size_t prim_triangle_index = 0; for(unsigned int i = 0; i < tidx_size; i++) { if(pack.prim_index[i] != -1) { - float4 storage[3]; + int tob = pack.prim_object[i]; + Object *ob = objects[tob]; - if(pack.prim_type[i] & PRIMITIVE_TRIANGLE) { - pack_triangle(i, storage); + if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]); + pack.prim_tri_index[i] = 3 * prim_triangle_index; + ++prim_triangle_index; } else { - /* Avoid use of uninitialized memory. */ - memset(&storage, 0, sizeof(storage)); + pack.prim_tri_index[i] = -1; } - memcpy(&pack.tri_storage[i * nsize], storage, sizeof(float4)*3); - - int tob = pack.prim_object[i]; - Object *ob = objects[tob]; pack.prim_visibility[i] = ob->visibility; if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) pack.prim_visibility[i] |= PATH_RAY_CURVE; } else { - memset(&pack.tri_storage[i * nsize], 0, sizeof(float4)*3); + pack.prim_tri_index[i] = -1; pack.prim_visibility[i] = 0; } } @@ -183,13 +193,13 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) { /* The BVH's for instances are built separately, but for traversal all * BVH's are stored in global arrays. This function merges them into the - * top level BVH, adjusting indexes and offsets where appropriate. */ - bool use_qbvh = params.use_qbvh; - size_t nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; - size_t nsize_leaf = (use_qbvh)? BVH_QNODE_LEAF_SIZE: BVH_NODE_LEAF_SIZE; + * top level BVH, adjusting indexes and offsets where appropriate. + */ + const bool use_qbvh = params.use_qbvh; - /* adjust primitive index to point to the triangle in the global array, for - * meshes with transform applied and already in the top level BVH */ + /* Adjust primitive index to point to the triangle in the global array, for + * meshes with transform applied and already in the top level BVH. 
+ */ for(size_t i = 0; i < pack.prim_index.size(); i++) if(pack.prim_index[i] != -1) { if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) @@ -208,10 +218,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* reserve */ size_t prim_index_size = pack.prim_index.size(); - size_t tri_storage_size = pack.tri_storage.size(); + size_t prim_tri_verts_size = pack.prim_tri_verts.size(); size_t pack_prim_index_offset = prim_index_size; - size_t pack_tri_storage_offset = tri_storage_size; + size_t pack_prim_tri_verts_offset = prim_tri_verts_size; size_t pack_nodes_offset = nodes_size; size_t pack_leaf_nodes_offset = leaf_nodes_size; size_t object_offset = 0; @@ -225,7 +235,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) if(mesh->need_build_bvh()) { if(mesh_map.find(mesh) == mesh_map.end()) { prim_index_size += bvh->pack.prim_index.size(); - tri_storage_size += bvh->pack.tri_storage.size(); + prim_tri_verts_size += bvh->pack.prim_tri_verts.size(); nodes_size += bvh->pack.nodes.size(); leaf_nodes_size += bvh->pack.leaf_nodes.size(); @@ -240,7 +250,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack.prim_type.resize(prim_index_size); pack.prim_object.resize(prim_index_size); pack.prim_visibility.resize(prim_index_size); - pack.tri_storage.resize(tri_storage_size); + pack.prim_tri_verts.resize(prim_tri_verts_size); + pack.prim_tri_index.resize(prim_index_size); pack.nodes.resize(nodes_size); pack.leaf_nodes.resize(leaf_nodes_size); pack.object_node.resize(objects.size()); @@ -249,7 +260,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL; int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL; uint *pack_prim_visibility = (pack.prim_visibility.size())? &pack.prim_visibility[0]: NULL; - float4 *pack_tri_storage = (pack.tri_storage.size())? 
&pack.tri_storage[0]: NULL; + float4 *pack_prim_tri_verts = (pack.prim_tri_verts.size())? &pack.prim_tri_verts[0]: NULL; + uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL; int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL; int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL; @@ -277,8 +289,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) BVH *bvh = mesh->bvh; - int noffset = nodes_offset/nsize; - int noffset_leaf = nodes_leaf_offset/nsize_leaf; + int noffset = nodes_offset; + int noffset_leaf = nodes_leaf_offset; int mesh_tri_offset = mesh->tri_offset; int mesh_curve_offset = mesh->curve_offset; @@ -290,18 +302,24 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) mesh_map[mesh] = pack.object_node[object_offset-1]; - /* merge primitive and object indexes */ + /* merge primitive, object and triangle indexes */ if(bvh->pack.prim_index.size()) { size_t bvh_prim_index_size = bvh->pack.prim_index.size(); int *bvh_prim_index = &bvh->pack.prim_index[0]; int *bvh_prim_type = &bvh->pack.prim_type[0]; uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0]; + uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0]; for(size_t i = 0; i < bvh_prim_index_size; i++) { - if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) + if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset; - else + pack_prim_tri_index[pack_prim_index_offset] = -1; + } + else { pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset; + pack_prim_tri_index[pack_prim_index_offset] = + bvh_prim_tri_index[i] + pack_prim_tri_verts_offset; + } pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i]; pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i]; @@ -310,50 +328,64 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) } } - /* merge triangle intersection data 
*/ - if(bvh->pack.tri_storage.size()) { - memcpy(pack_tri_storage + pack_tri_storage_offset, - &bvh->pack.tri_storage[0], - bvh->pack.tri_storage.size()*sizeof(float4)); - pack_tri_storage_offset += bvh->pack.tri_storage.size(); + /* Merge triangle vertices data. */ + if(bvh->pack.prim_tri_verts.size()) { + const size_t prim_tri_size = bvh->pack.prim_tri_verts.size(); + memcpy(pack_prim_tri_verts + pack_prim_tri_verts_offset, + &bvh->pack.prim_tri_verts[0], + prim_tri_size*sizeof(float4)); + pack_prim_tri_verts_offset += prim_tri_size; } /* merge nodes */ if(bvh->pack.leaf_nodes.size()) { int4 *leaf_nodes_offset = &bvh->pack.leaf_nodes[0]; size_t leaf_nodes_offset_size = bvh->pack.leaf_nodes.size(); - for(size_t i = 0, j = 0; i < leaf_nodes_offset_size; i+=nsize_leaf, j++) { + for(size_t i = 0, j = 0; + i < leaf_nodes_offset_size; + i+= BVH_NODE_LEAF_SIZE, j++) + { int4 data = leaf_nodes_offset[i]; data.x += prim_offset; data.y += prim_offset; pack_leaf_nodes[pack_leaf_nodes_offset] = data; - for(int j = 1; j < nsize_leaf; ++j) { + for(int j = 1; j < BVH_NODE_LEAF_SIZE; ++j) { pack_leaf_nodes[pack_leaf_nodes_offset + j] = leaf_nodes_offset[i + j]; } - pack_leaf_nodes_offset += nsize_leaf; + pack_leaf_nodes_offset += BVH_NODE_LEAF_SIZE; } } if(bvh->pack.nodes.size()) { - /* For QBVH we're packing a child bbox into 6 float4, - * and for regular BVH they're packed into 3 float4. - */ - size_t nsize_bbox = (use_qbvh)? 6: 3; int4 *bvh_nodes = &bvh->pack.nodes[0]; - size_t bvh_nodes_size = bvh->pack.nodes.size(); + size_t bvh_nodes_size = bvh->pack.nodes.size(); + + for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) { + size_t nsize, nsize_bbox; + if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) { + nsize = use_qbvh + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_UNALIGNED_NODE_SIZE; + nsize_bbox = (use_qbvh)? 13: 0; + } + else { + nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; + nsize_bbox = (use_qbvh)? 
7: 0; + } - for(size_t i = 0, j = 0; i < bvh_nodes_size; i+=nsize, j++) { - memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox*sizeof(int4)); + memcpy(pack_nodes + pack_nodes_offset, + bvh_nodes + i, + nsize_bbox*sizeof(int4)); - /* modify offsets into arrays */ + /* Modify offsets into arrays */ int4 data = bvh_nodes[i + nsize_bbox]; - data.x += (data.x < 0)? -noffset_leaf: noffset; - data.y += (data.y < 0)? -noffset_leaf: noffset; + data.z += (data.z < 0)? -noffset_leaf: noffset; + data.w += (data.w < 0)? -noffset_leaf: noffset; if(use_qbvh) { - data.z += (data.z < 0)? -noffset_leaf: noffset; - data.w += (data.w < 0)? -noffset_leaf: noffset; + data.x += (data.x < 0)? -noffset_leaf: noffset; + data.y += (data.y < 0)? -noffset_leaf: noffset; } pack_nodes[pack_nodes_offset + nsize_bbox] = data; @@ -366,6 +398,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) sizeof(int4) * (nsize - (nsize_bbox+1))); pack_nodes_offset += nsize; + i += nsize; } } @@ -377,12 +410,20 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* Regular BVH */ +static bool node_bvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + return node0->is_unaligned() || node1->is_unaligned(); +} + RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { } -void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +void RegularBVH::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) { float4 data[BVH_NODE_LEAF_SIZE]; memset(data, 0, sizeof(data)); @@ -401,54 +442,130 @@ void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); } - memcpy(&pack.leaf_nodes[e.idx * BVH_NODE_LEAF_SIZE], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); +} + +void RegularBVH::pack_inner(const 
BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + if (e0.node->is_unaligned() || e1.node->is_unaligned()) { + pack_unaligned_inner(e, e0, e1); + } else { + pack_aligned_inner(e, e0, e1); + } } -void RegularBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1) +void RegularBVH::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { - pack_node(e.idx, e0.node->m_bounds, e1.node->m_bounds, e0.encodeIdx(), e1.encodeIdx(), e0.node->m_visibility, e1.node->m_visibility); + pack_aligned_node(e.idx, + e0.node->m_bounds, e1.node->m_bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED, + e1.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED); } -void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int c0, int c1, uint visibility0, uint visibility1) +void RegularBVH::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1) { int4 data[BVH_NODE_SIZE] = { + make_int4(visibility0, visibility1, c0, c1), make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)), make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)), make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)), - make_int4(c0, c1, visibility0, visibility1) }; - memcpy(&pack.nodes[idx * BVH_NODE_SIZE], data, sizeof(int4)*BVH_NODE_SIZE); + memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); } -void RegularBVH::pack_nodes(const BVHNode *root) +void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { - size_t tot_node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - size_t node_size = 
tot_node_size - leaf_node_size; + pack_unaligned_node(e.idx, + e0.node->get_aligned_space(), + e1.node->get_aligned_space(), + e0.node->m_bounds, + e1.node->m_bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->m_visibility, e1.node->m_visibility); +} - /* resize arrays */ - pack.nodes.clear(); +void RegularBVH::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + float4 data[BVH_UNALIGNED_NODE_SIZE]; + Transform space0 = BVHUnaligned::compute_node_transform(bounds0, + aligned_space0); + Transform space1 = BVHUnaligned::compute_node_transform(bounds1, + aligned_space1); + data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(c0), + __int_as_float(c1)); + + data[1] = space0.x; + data[2] = space0.y; + data[3] = space0.z; + data[4] = space1.x; + data[5] = space1.y; + data[6] = space1.z; + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); +} - /* for top level BVH, first merge existing BVH's so we know the offsets */ +void RegularBVH::pack_nodes(const BVHNode *root) +{ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_NODE_SIZE; + } + /* Resize arrays */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. 
*/ if(params.top_level) { - pack_instances(node_size*BVH_NODE_SIZE, - leaf_node_size*BVH_NODE_LEAF_SIZE); + pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); } else { - pack.nodes.resize(node_size*BVH_NODE_SIZE); - pack.leaf_nodes.resize(leaf_node_size*BVH_NODE_LEAF_SIZE); + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); } int nextNodeIdx = 0, nextLeafNodeIdx = 0; vector<BVHStackEntry> stack; stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) + if(root->is_leaf()) { stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - else - stack.push_back(BVHStackEntry(root, nextNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_bvh_is_unaligned(root) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } while(stack.size()) { BVHStackEntry e = stack.back(); @@ -456,20 +573,31 @@ void RegularBVH::pack_nodes(const BVHNode *root) if(e.node->is_leaf()) { /* leaf node */ - const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node); + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); pack_leaf(e, leaf); } else { /* innner node */ - int idx0 = (e.node->get_child(0)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++); - int idx1 = (e.node->get_child(1)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++); - stack.push_back(BVHStackEntry(e.node->get_child(0), idx0)); - stack.push_back(BVHStackEntry(e.node->get_child(1), idx1)); + int idx[2]; + for (int i = 0; i < 2; ++i) { + if (e.node->get_child(i)->is_leaf()) { + idx[i] = nextLeafNodeIdx++; + } + else { + idx[i] = nextNodeIdx; + nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + ? 
BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + } + + stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); + stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); } } - + assert(node_size == nextNodeIdx); /* root index to start traversal at, to handle case of single leaf node */ pack.root_index = (root->is_leaf())? -1: 0; } @@ -486,7 +614,7 @@ void RegularBVH::refit_nodes() void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) { if(leaf) { - int4 *data = &pack.leaf_nodes[idx*BVH_NODE_LEAF_SIZE]; + int4 *data = &pack.leaf_nodes[idx]; int c0 = data[0].x; int c1 = data[0].y; /* refit leaf node */ @@ -565,9 +693,9 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility sizeof(float4)*BVH_NODE_LEAF_SIZE); } else { - int4 *data = &pack.nodes[idx*BVH_NODE_SIZE]; - int c0 = data[3].x; - int c1 = data[3].y; + int4 *data = &pack.nodes[idx]; + int c0 = data[0].z; + int c1 = data[0].w; /* refit inner node, set bbox from children */ BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; uint visibility0 = 0, visibility1 = 0; @@ -575,7 +703,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); - pack_node(idx, bbox0, bbox1, c0, c1, visibility0, visibility1); + pack_aligned_node(idx, bbox0, bbox1, c0, c1, visibility0, visibility1); bbox.grow(bbox0); bbox.grow(bbox1); @@ -585,6 +713,33 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility /* QBVH */ +/* Can we avoid this somehow or make more generic? + * + * Perhaps we can merge nodes in actual tree and make our + * life easier all over the place. 
+ */ +static bool node_qbvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + bool has_unaligned = false; + if(node0->is_leaf()) { + has_unaligned |= node0->is_unaligned(); + } + else { + has_unaligned |= node0->get_child(0)->is_unaligned(); + has_unaligned |= node0->get_child(1)->is_unaligned(); + } + if(node1->is_leaf()) { + has_unaligned |= node1->is_unaligned(); + } + else { + has_unaligned |= node1->get_child(0)->is_unaligned(); + has_unaligned |= node1->get_child(1)->is_unaligned(); + } + return has_unaligned; +} + QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { @@ -610,66 +765,153 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); } - memcpy(&pack.leaf_nodes[e.idx * BVH_QNODE_LEAF_SIZE], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); +} + +void QBVH::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned()) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. 
+ */ + pack_aligned_inner(e, en, num); + } } -void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num) +void QBVH::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) { float4 data[BVH_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + data[0].x = __uint_as_float(e.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED); for(int i = 0; i < num; i++) { float3 bb_min = en[i].node->m_bounds.min; float3 bb_max = en[i].node->m_bounds.max; - data[0][i] = bb_min.x; - data[1][i] = bb_max.x; - data[2][i] = bb_min.y; - data[3][i] = bb_max.y; - data[4][i] = bb_min.z; - data[5][i] = bb_max.z; + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; - data[6][i] = __int_as_float(en[i].encodeIdx()); + data[7][i] = __int_as_float(en[i].encodeIdx()); } for(int i = num; i < 4; i++) { /* We store BB which would never be recorded as intersection * so kernel might safely assume there are always 4 child nodes. 
*/ - data[0][i] = FLT_MAX; - data[1][i] = -FLT_MAX; + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_QNODE_SIZE); +} + +void QBVH::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + float4 data[BVH_UNALIGNED_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(e.node->m_visibility | PATH_RAY_NODE_UNALIGNED); + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + en[i].node->m_bounds, + en[i].node->get_aligned_space()); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; - data[2][i] = FLT_MAX; - data[3][i] = -FLT_MAX; + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; - data[4][i] = FLT_MAX; - data[5][i] = -FLT_MAX; + data[13][i] = __int_as_float(en[i].encodeIdx()); + } - data[6][i] = __int_as_float(0); + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + for(int j = 1; j < 13; ++j) { + data[j][i] = 0.0f; + } + data[13][i] = __int_as_float(0); } - memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE); + memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); } /* Quad SIMD Nodes */ void QBVH::pack_nodes(const BVHNode *root) { - size_t tot_node_size = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); - size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - size_t node_size = tot_node_size - leaf_node_size; - - /* resize arrays */ + /* Calculate size of the arrays required. 
*/ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_QNODE_SIZE; + } + /* Resize arrays. */ pack.nodes.clear(); pack.leaf_nodes.clear(); - - /* for top level BVH, first merge existing BVH's so we know the offsets */ + /* For top level BVH, first merge existing BVH's so we know the offsets. */ if(params.top_level) { - pack_instances(node_size*BVH_QNODE_SIZE, - leaf_node_size*BVH_QNODE_LEAF_SIZE); + pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); } else { - pack.nodes.resize(node_size*BVH_QNODE_SIZE); - pack.leaf_nodes.resize(leaf_node_size*BVH_QNODE_LEAF_SIZE); + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); } int nextNodeIdx = 0, nextLeafNodeIdx = 0; @@ -680,7 +922,10 @@ void QBVH::pack_nodes(const BVHNode *root) stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); } else { - stack.push_back(BVHStackEntry(root, nextNodeIdx++)); + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_qbvh_is_unaligned(root) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; } while(stack.size()) { @@ -689,19 +934,17 @@ void QBVH::pack_nodes(const BVHNode *root) if(e.node->is_leaf()) { /* leaf node */ - const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node); + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); pack_leaf(e, leaf); } else { - /* inner node */ + /* Inner node. 
*/ const BVHNode *node = e.node; const BVHNode *node0 = node->get_child(0); const BVHNode *node1 = node->get_child(1); - - /* collect nodes */ + /* Collect nodes. */ const BVHNode *nodes[4]; int numnodes = 0; - if(node0->is_leaf()) { nodes[numnodes++] = node0; } @@ -709,7 +952,6 @@ void QBVH::pack_nodes(const BVHNode *root) nodes[numnodes++] = node0->get_child(0); nodes[numnodes++] = node0->get_child(1); } - if(node1->is_leaf()) { nodes[numnodes++] = node1; } @@ -717,25 +959,26 @@ void QBVH::pack_nodes(const BVHNode *root) nodes[numnodes++] = node1->get_child(0); nodes[numnodes++] = node1->get_child(1); } - - /* push entries on the stack */ - for(int i = 0; i < numnodes; i++) { + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { int idx; if(nodes[i]->is_leaf()) { idx = nextLeafNodeIdx++; } else { - idx = nextNodeIdx++; + idx = nextNodeIdx; + nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; } stack.push_back(BVHStackEntry(nodes[i], idx)); } - - /* set node */ + /* Set node. */ pack_inner(e, &stack[stack.size()-numnodes], numnodes); } } - - /* root index to start traversal at, to handle case of single leaf node */ + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ pack.root_index = (root->is_leaf())? -1: 0; } @@ -751,7 +994,7 @@ void QBVH::refit_nodes() void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) { if(leaf) { - int4 *data = &pack.leaf_nodes[idx*BVH_QNODE_LEAF_SIZE]; + int4 *data = &pack.leaf_nodes[idx]; int4 c = data[0]; /* Refit leaf node. 
*/ for(int prim = c.x; prim < c.y; prim++) { @@ -833,13 +1076,18 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) leaf_data[0].y = __int_as_float(c.y); leaf_data[0].z = __uint_as_float(visibility); leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx * BVH_QNODE_LEAF_SIZE], - leaf_data, - sizeof(float4)*BVH_QNODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); } else { - int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE]; - int4 c = data[6]; + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[13]; + } + else { + c = data[7]; + } /* Refit inner node, set bbox from children. */ BoundBox child_bbox[4] = {BoundBox::empty, BoundBox::empty, @@ -858,21 +1106,62 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) } } - float4 inner_data[BVH_QNODE_SIZE]; - for(int i = 0; i < 4; ++i) { - float3 bb_min = child_bbox[i].min; - float3 bb_max = child_bbox[i].max; - inner_data[0][i] = bb_min.x; - inner_data[1][i] = bb_max.x; - inner_data[2][i] = bb_min.y; - inner_data[3][i] = bb_max.y; - inner_data[4][i] = bb_min.z; - inner_data[5][i] = bb_max.z; - inner_data[6][i] = __int_as_float(c[i]); + /* TODO(sergey): To be de-duplicated with pack_inner(), + * but for that need some sort of pack_node(). which operates with + * direct data, not stack element. 
+ */ + if(is_unaligned) { + Transform aligned_space = transform_identity(); + float4 inner_data[BVH_UNALIGNED_QNODE_SIZE]; + inner_data[0] = make_float4( + __int_as_float(visibility | PATH_RAY_NODE_UNALIGNED), + 0.0f, + 0.0f, + 0.0f); + for(int i = 0; i < 4; ++i) { + Transform space = BVHUnaligned::compute_node_transform( + child_bbox[i], + aligned_space); + inner_data[1][i] = space.x.x; + inner_data[2][i] = space.x.y; + inner_data[3][i] = space.x.z; + + inner_data[4][i] = space.y.x; + inner_data[5][i] = space.y.y; + inner_data[6][i] = space.y.z; + + inner_data[7][i] = space.z.x; + inner_data[8][i] = space.z.y; + inner_data[9][i] = space.z.z; + + inner_data[10][i] = space.x.w; + inner_data[11][i] = space.y.w; + inner_data[12][i] = space.z.w; + + inner_data[13][i] = __int_as_float(c[i]); + } + memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); + } + else { + float4 inner_data[BVH_QNODE_SIZE]; + inner_data[0] = make_float4( + __int_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED), + 0.0f, + 0.0f, + 0.0f); + for(int i = 0; i < 4; ++i) { + float3 bb_min = child_bbox[i].min; + float3 bb_max = child_bbox[i].max; + inner_data[1][i] = bb_min.x; + inner_data[2][i] = bb_max.x; + inner_data[3][i] = bb_min.y; + inner_data[4][i] = bb_max.y; + inner_data[5][i] = bb_min.z; + inner_data[6][i] = bb_max.z; + inner_data[7][i] = __int_as_float(c[i]); + } + memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_QNODE_SIZE); } - memcpy(&pack.nodes[idx * BVH_QNODE_SIZE], - inner_data, - sizeof(float4)*BVH_QNODE_SIZE); } } diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 6076c25ca31..16752076f6a 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -35,11 +35,14 @@ class Progress; #define BVH_NODE_SIZE 4 #define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 7 +#define BVH_QNODE_SIZE 8 #define BVH_QNODE_LEAF_SIZE 1 #define BVH_ALIGN 4096 #define TRI_NODE_SIZE 3 +#define BVH_UNALIGNED_NODE_SIZE 7 +#define 
BVH_UNALIGNED_QNODE_SIZE 14 + /* Packed BVH * * BVH stored as it will be used for traversal on the rendering device. */ @@ -52,8 +55,10 @@ struct PackedBVH { array<int4> leaf_nodes; /* object index to BVH node index mapping for instances */ array<int> object_node; - /* Aligned triangle storage for fatser lookup in the kernel. */ - array<float4> tri_storage; + /* Mapping from primitive index to index in triangle array. */ + array<uint> prim_tri_index; + /* Continuous storage of triangle vertices. */ + array<float4> prim_tri_verts; /* primitive type - triangle or strand */ array<int> prim_type; /* visibility visibilitys for primitives */ @@ -91,7 +96,7 @@ public: protected: BVH(const BVHParams& params, const vector<Object*>& objects); - /* triangles and strands*/ + /* triangles and strands */ void pack_primitives(); void pack_triangle(int idx, float4 storage[3]); @@ -115,9 +120,32 @@ protected: /* pack */ void pack_nodes(const BVHNode *root); - void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1); - void pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int c0, int c1, uint visibility0, uint visibility1); + + void pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); /* refit */ void refit_nodes(); @@ -136,9 
+164,17 @@ protected: /* pack */ void pack_nodes(const BVHNode *root); + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + /* refit */ void refit_nodes(); void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index b07e870d759..5ddd7349f7b 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -52,12 +52,35 @@ __forceinline int get_best_dimension(const float4& bestSAH) /* BVH Object Binning */ -BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) -: BVHRange(job), splitSAH(FLT_MAX), dim(0), pos(0) +BVHObjectBinning::BVHObjectBinning(const BVHRange& job, + BVHReference *prims, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) +: BVHRange(job), + splitSAH(FLT_MAX), + dim(0), + pos(0), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { + if(aligned_space_ == NULL) { + bounds_ = bounds(); + cent_bounds_ = cent_bounds(); + } + else { + /* TODO(sergey): With some additional storage we can avoid + * need in re-calculating this. 
+ */ + bounds_ = unaligned_heuristic->compute_aligned_boundbox( + *this, + prims, + *aligned_space, + ¢_bounds_); + } + /* compute number of bins to use and precompute scaling factor for binning */ num_bins = min(size_t(MAX_BINS), size_t(4.0f + 0.05f*size())); - scale = rcp(cent_bounds().size()) * make_float3((float)num_bins); + scale = rcp(cent_bounds_.size()) * make_float3((float)num_bins); /* initialize binning counter and bounds */ BoundBox bin_bounds[MAX_BINS][4]; /* bounds for every bin in every dimension */ @@ -79,30 +102,34 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) const BVHReference& prim0 = prims[start() + i + 0]; const BVHReference& prim1 = prims[start() + i + 1]; - int4 bin0 = get_bin(prim0.bounds()); - int4 bin1 = get_bin(prim1.bounds()); + BoundBox bounds0 = get_prim_bounds(prim0); + BoundBox bounds1 = get_prim_bounds(prim1); + + int4 bin0 = get_bin(bounds0); + int4 bin1 = get_bin(bounds1); /* increase bounds for bins for even primitive */ - int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds()); - int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds()); - int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds()); + int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(bounds0); + int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(bounds0); + int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(bounds0); /* increase bounds of bins for odd primitive */ - int b10 = (int)extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(prim1.bounds()); - int b11 = (int)extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(prim1.bounds()); - int b12 = (int)extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(prim1.bounds()); + int b10 = (int)extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(bounds1); + 
int b11 = (int)extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(bounds1); + int b12 = (int)extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(bounds1); } /* for uneven number of primitives */ if(i < ssize_t(size())) { /* map primitive to bin */ const BVHReference& prim0 = prims[start() + i]; - int4 bin0 = get_bin(prim0.bounds()); + BoundBox bounds0 = get_prim_bounds(prim0); + int4 bin0 = get_bin(bounds0); /* increase bounds of bins */ - int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds()); - int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds()); - int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds()); + int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(bounds0); + int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(bounds0); + int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(bounds0); } } @@ -151,17 +178,19 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) bestSAH = min(sah,bestSAH); } - int4 mask = float3_to_float4(cent_bounds().size()) <= make_float4(0.0f); + int4 mask = float3_to_float4(cent_bounds_.size()) <= make_float4(0.0f); bestSAH = insert<3>(select(mask, make_float4(FLT_MAX), bestSAH), FLT_MAX); /* find best dimension */ dim = get_best_dimension(bestSAH); splitSAH = bestSAH[dim]; pos = bestSplit[dim]; - leafSAH = bounds().half_area() * blocks(size()); + leafSAH = bounds_.half_area() * blocks(size()); } -void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const +void BVHObjectBinning::split(BVHReference* prims, + BVHObjectBinning& left_o, + BVHObjectBinning& right_o) const { size_t N = size(); @@ -176,10 +205,12 @@ void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHO prefetch_L2(&prims[start() + l + 8]); 
prefetch_L2(&prims[start() + r - 8]); - const BVHReference& prim = prims[start() + l]; + BVHReference prim = prims[start() + l]; + BoundBox unaligned_bounds = get_prim_bounds(prim); + float3 unaligned_center = unaligned_bounds.center2(); float3 center = prim.bounds().center2(); - if(get_bin(center)[dim] < pos) { + if(get_bin(unaligned_center)[dim] < pos) { lgeom_bounds.grow(prim.bounds()); lcent_bounds.grow(center); l++; @@ -191,7 +222,6 @@ void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHO r--; } } - /* finish */ if(l != 0 && N-1-r != 0) { right_o = BVHObjectBinning(BVHRange(rgeom_bounds, rcent_bounds, start() + l, N-1-r), prims); diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 60742157055..52955f70151 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -19,11 +19,14 @@ #define __BVH_BINNING_H__ #include "bvh_params.h" +#include "bvh_unaligned.h" #include "util_types.h" CCL_NAMESPACE_BEGIN +class BVHBuild; + /* Single threaded object binner. Finds the split with the best SAH heuristic * by testing for each dimension multiple partitionings for regular spaced * partition locations. 
A partitioning for a partition location is computed, @@ -34,10 +37,18 @@ CCL_NAMESPACE_BEGIN class BVHObjectBinning : public BVHRange { public: - __forceinline BVHObjectBinning() {} - BVHObjectBinning(const BVHRange& job, BVHReference *prims); + __forceinline BVHObjectBinning() : leafSAH(FLT_MAX) {} + + BVHObjectBinning(const BVHRange& job, + BVHReference *prims, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); - void split(BVHReference *prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const; + void split(BVHReference *prims, + BVHObjectBinning& left_o, + BVHObjectBinning& right_o) const; + + __forceinline const BoundBox& unaligned_bounds() { return bounds_; } float splitSAH; /* SAH cost of the best split */ float leafSAH; /* SAH cost of creating a leaf */ @@ -48,13 +59,20 @@ protected: size_t num_bins; /* actual number of bins to use */ float3 scale; /* scaling factor to compute bin */ + /* Effective bounds and centroid bounds. */ + BoundBox bounds_; + BoundBox cent_bounds_; + + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; + enum { MAX_BINS = 32 }; enum { LOG_BLOCK_SIZE = 2 }; /* computes the bin numbers for each dimension for a box. */ __forceinline int4 get_bin(const BoundBox& box) const { - int4 a = make_int4((box.center2() - cent_bounds().min)*scale - make_float3(0.5f)); + int4 a = make_int4((box.center2() - cent_bounds_.min)*scale - make_float3(0.5f)); int4 mn = make_int4(0); int4 mx = make_int4((int)num_bins-1); @@ -64,7 +82,7 @@ protected: /* computes the bin numbers for each dimension for a point. */ __forceinline int4 get_bin(const float3& c) const { - return make_int4((c - cent_bounds().min)*scale - make_float3(0.5f)); + return make_int4((c - cent_bounds_.min)*scale - make_float3(0.5f)); } /* compute the number of blocks occupied for each dimension. 
*/ @@ -78,6 +96,17 @@ protected: { return (int)((a+((1LL << LOG_BLOCK_SIZE)-1)) >> LOG_BLOCK_SIZE); } + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } }; CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 3f687224eee..67ffb6853d6 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -33,6 +33,7 @@ #include "util_stack_allocator.h" #include "util_simd.h" #include "util_time.h" +#include "util_queue.h" CCL_NAMESPACE_BEGIN @@ -99,7 +100,8 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_, prim_object(prim_object_), params(params_), progress(progress_), - progress_start_time(0.0) + progress_start_time(0.0), + unaligned_heuristic(objects_) { spatial_min_overlap = 0.0f; } @@ -112,70 +114,74 @@ BVHBuild::~BVHBuild() void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i) { - Attribute *attr_mP = NULL; - - if(mesh->has_motion_blur()) - attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + Attribute *attr_mP = NULL; - size_t num_triangles = mesh->num_triangles(); - for(uint j = 0; j < num_triangles; j++) { - Mesh::Triangle t = mesh->get_triangle(j); - BoundBox bounds = BoundBox::empty; - PrimitiveType type = PRIMITIVE_TRIANGLE; + if(mesh->has_motion_blur()) + attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + size_t num_triangles = mesh->num_triangles(); + for(uint j = 0; j < num_triangles; j++) { + Mesh::Triangle t = mesh->get_triangle(j); + BoundBox bounds = BoundBox::empty; + PrimitiveType type = PRIMITIVE_TRIANGLE; - t.bounds_grow(&mesh->verts[0], bounds); + t.bounds_grow(&mesh->verts[0], bounds); - /* motion triangles */ - if(attr_mP) { - size_t mesh_size = 
mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr_mP->data_float3(); + /* motion triangles */ + if(attr_mP) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr_mP->data_float3(); - for(size_t i = 0; i < steps; i++) - t.bounds_grow(vert_steps + i*mesh_size, bounds); + for(size_t i = 0; i < steps; i++) + t.bounds_grow(vert_steps + i*mesh_size, bounds); - type = PRIMITIVE_MOTION_TRIANGLE; - } + type = PRIMITIVE_MOTION_TRIANGLE; + } - if(bounds.valid()) { - references.push_back(BVHReference(bounds, j, i, type)); - root.grow(bounds); - center.grow(bounds.center2()); + if(bounds.valid()) { + references.push_back(BVHReference(bounds, j, i, type)); + root.grow(bounds); + center.grow(bounds.center2()); + } } } - Attribute *curve_attr_mP = NULL; + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + Attribute *curve_attr_mP = NULL; - if(mesh->has_motion_blur()) - curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(mesh->has_motion_blur()) + curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - size_t num_curves = mesh->num_curves(); - for(uint j = 0; j < num_curves; j++) { - Mesh::Curve curve = mesh->get_curve(j); - PrimitiveType type = PRIMITIVE_CURVE; + size_t num_curves = mesh->num_curves(); + for(uint j = 0; j < num_curves; j++) { + Mesh::Curve curve = mesh->get_curve(j); + PrimitiveType type = PRIMITIVE_CURVE; - for(int k = 0; k < curve.num_keys - 1; k++) { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds); + for(int k = 0; k < curve.num_keys - 1; k++) { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds); - /* motion curve */ - if(curve_attr_mP) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = curve_attr_mP->data_float3(); + /* 
motion curve */ + if(curve_attr_mP) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = curve_attr_mP->data_float3(); - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds); + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds); - type = PRIMITIVE_MOTION_CURVE; - } + type = PRIMITIVE_MOTION_CURVE; + } - if(bounds.valid()) { - int packed_type = PRIMITIVE_PACK_SEGMENT(type, k); - - references.push_back(BVHReference(bounds, j, i, packed_type)); - root.grow(bounds); - center.grow(bounds.center2()); + if(bounds.valid()) { + int packed_type = PRIMITIVE_PACK_SEGMENT(type, k); + + references.push_back(BVHReference(bounds, j, i, packed_type)); + root.grow(bounds); + center.grow(bounds.center2()); + } } } } @@ -209,15 +215,23 @@ void BVHBuild::add_references(BVHRange& root) continue; } if(!ob->mesh->is_instanced()) { - num_alloc_references += ob->mesh->num_triangles(); - num_alloc_references += count_curve_segments(ob->mesh); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + num_alloc_references += ob->mesh->num_triangles(); + } + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + num_alloc_references += count_curve_segments(ob->mesh); + } } else num_alloc_references++; } else { - num_alloc_references += ob->mesh->num_triangles(); - num_alloc_references += count_curve_segments(ob->mesh); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + num_alloc_references += ob->mesh->num_triangles(); + } + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + num_alloc_references += count_curve_segments(ob->mesh); + } } } @@ -340,6 +354,8 @@ BVHNode* BVHBuild::run() << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)) << "\n" << " Number of leaf nodes: " << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)) << "\n" + << " Number of unaligned 
nodes: " + << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_UNALIGNED_COUNT)) << "\n" << " Allocation slop factor: " << ((prim_type.capacity() != 0) ? (float)prim_type.size() / prim_type.capacity() @@ -445,10 +461,11 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) float leafSAH = params.sah_primitive_cost * range.leafSAH; float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_primitive_cost * range.splitSAH; - /* have at least one inner node on top level, for performance and correct - * visibility tests, since object instances do not check visibility flag */ + /* Have at least one inner node on top level, for performance and correct + * visibility tests, since object instances do not check visibility flag. + */ if(!(range.size() > 0 && params.top_level && level == 0)) { - /* make leaf node when threshold reached or SAH tells us */ + /* Make leaf node when threshold reached or SAH tells us. */ if((params.small_enough_for_leaf(size, level)) || (range_within_max_leaf_size(range, references) && leafSAH < splitSAH)) { @@ -456,28 +473,70 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) } } - /* perform split */ + BVHObjectBinning unaligned_range; + float unalignedSplitSAH = FLT_MAX; + float unalignedLeafSAH = FLT_MAX; + Transform aligned_space; + if(params.use_unaligned_nodes && + splitSAH > params.unaligned_split_threshold*leafSAH) + { + aligned_space = unaligned_heuristic.compute_aligned_space( + range, &references[0]); + unaligned_range = BVHObjectBinning(range, + &references[0], + &unaligned_heuristic, + &aligned_space); + unalignedSplitSAH = params.sah_node_cost * unaligned_range.unaligned_bounds().half_area() + + params.sah_primitive_cost * unaligned_range.splitSAH; + unalignedLeafSAH = params.sah_primitive_cost * unaligned_range.leafSAH; + if(!(range.size() > 0 && params.top_level && level == 0)) { + if(unalignedLeafSAH < unalignedSplitSAH && unalignedSplitSAH < splitSAH 
&& + range_within_max_leaf_size(range, references)) + { + return create_leaf_node(range, references); + } + } + } + + /* Perform split. */ BVHObjectBinning left, right; - range.split(&references[0], left, right); + if(unalignedSplitSAH < splitSAH) { + unaligned_range.split(&references[0], left, right); + } + else { + range.split(&references[0], left, right); + } - /* create inner node. */ - InnerNode *inner; + BoundBox bounds; + if(unalignedSplitSAH < splitSAH) { + bounds = unaligned_heuristic.compute_aligned_boundbox( + range, &references[0], aligned_space); + } + else { + bounds = range.bounds(); + } + /* Create inner node. */ + InnerNode *inner; if(range.size() < THREAD_TASK_SIZE) { /* local build */ BVHNode *leftnode = build_node(left, level + 1); BVHNode *rightnode = build_node(right, level + 1); - inner = new InnerNode(range.bounds(), leftnode, rightnode); + inner = new InnerNode(bounds, leftnode, rightnode); } else { - /* threaded build */ - inner = new InnerNode(range.bounds()); + /* Threaded build */ + inner = new InnerNode(bounds); task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true); task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true); } + if(unalignedSplitSAH < splitSAH) { + inner->set_aligned_space(aligned_space); + } + return inner; } @@ -516,16 +575,54 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, return create_leaf_node(range, *references); } } + float leafSAH = params.sah_primitive_cost * split.leafSAH; + float splitSAH = params.sah_node_cost * range.bounds().half_area() + + params.sah_primitive_cost * split.nodeSAH; + + BVHMixedSplit unaligned_split; + float unalignedSplitSAH = FLT_MAX; + /* float unalignedLeafSAH = FLT_MAX; */ + Transform aligned_space; + if(params.use_unaligned_nodes && + splitSAH > params.unaligned_split_threshold*leafSAH) + { + aligned_space = + unaligned_heuristic.compute_aligned_space(range, &references->at(0)); + unaligned_split = BVHMixedSplit(this, + storage, + range, + 
references, + level, + &unaligned_heuristic, + &aligned_space); + /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */ + unalignedSplitSAH = params.sah_node_cost * unaligned_split.bounds.half_area() + + params.sah_primitive_cost * unaligned_split.nodeSAH; + /* TOOD(sergey): Check we can create leaf already. */ + } /* Do split. */ BVHRange left, right; - split.split(this, left, right, range); + if(unalignedSplitSAH < splitSAH) { + unaligned_split.split(this, left, right, range); + } + else { + split.split(this, left, right, range); + } progress_total += left.size() + right.size() - range.size(); + BoundBox bounds; + if(unalignedSplitSAH < splitSAH) { + bounds = unaligned_heuristic.compute_aligned_boundbox( + range, &references->at(0), aligned_space); + } + else { + bounds = range.bounds(); + } + /* Create inner node. */ InnerNode *inner; - if(range.size() < THREAD_TASK_SIZE) { /* Local build. */ @@ -539,11 +636,11 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, /* Build right node. */ BVHNode *rightnode = build_node(right, ©, level + 1, thread_id); - inner = new InnerNode(range.bounds(), leftnode, rightnode); + inner = new InnerNode(bounds, leftnode, rightnode); } else { /* Threaded build. */ - inner = new InnerNode(range.bounds()); + inner = new InnerNode(bounds); task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 0, @@ -560,6 +657,10 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, true); } + if(unalignedSplitSAH < splitSAH) { + inner->set_aligned_space(aligned_space); + } + return inner; } @@ -616,6 +717,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; + vector<BVHReference, LeafStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; /* TODO(sergey): In theory we should be able to store references. 
*/ typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; @@ -634,6 +736,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, const BVHReference& ref = references[range.start() + i]; if(ref.prim_index() != -1) { int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL); + p_ref[type_index].push_back(ref); p_type[type_index].push_back(ref.prim_type()); p_index[type_index].push_back(ref.prim_index()); p_object[type_index].push_back(ref.prim_object()); @@ -674,16 +777,38 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, if(num != 0) { assert(p_type[i].size() == p_index[i].size()); assert(p_type[i].size() == p_object[i].size()); + Transform aligned_space; + bool alignment_found = false; for(int j = 0; j < num; ++j) { const int index = start_index + j; local_prim_type[index] = p_type[i][j]; local_prim_index[index] = p_index[i][j]; local_prim_object[index] = p_object[i][j]; + if(params.use_unaligned_nodes && !alignment_found) { + alignment_found = + unaligned_heuristic.compute_aligned_space(p_ref[i][j], + &aligned_space); + } + } + LeafNode *leaf_node = new LeafNode(bounds[i], + visibility[i], + start_index, + start_index + num); + if(alignment_found) { + /* Need to recalculate leaf bounds with new alignment. */ + leaf_node->m_bounds = BoundBox::empty; + for(int j = 0; j < num; ++j) { + const BVHReference &ref = p_ref[i][j]; + BoundBox ref_bounds = + unaligned_heuristic.compute_aligned_prim_boundbox( + ref, + aligned_space); + leaf_node->m_bounds.grow(ref_bounds); + } + /* Set alignment space. */ + leaf_node->set_aligned_space(aligned_space); } - leaves[num_leaves++] = new LeafNode(bounds[i], - visibility[i], - start_index, - start_index + num); + leaves[num_leaves++] = leaf_node; start_index += num; } } @@ -765,6 +890,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, ++num_leaves; } + /* TODO(sergey): Need to take care of alignment when number of leaves + * is more than 1. 
+ */ if(num_leaves == 1) { /* Simplest case: single leaf, just return it. * In all the rest cases we'll be creating intermediate inner node with diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index a015b89d72f..64180349935 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -22,6 +22,7 @@ #include "bvh.h" #include "bvh_binning.h" +#include "bvh_unaligned.h" #include "util_boundbox.h" #include "util_task.h" @@ -59,13 +60,14 @@ protected: friend class BVHSpatialSplit; friend class BVHBuildTask; friend class BVHSpatialSplitBuildTask; + friend class BVHObjectBinning; - /* adding references */ + /* Adding references. */ void add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i); void add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i); void add_references(BVHRange& root); - /* building */ + /* Building. */ BVHNode *build_node(const BVHRange& range, vector<BVHReference> *references, int level, @@ -78,7 +80,7 @@ protected: bool range_within_max_leaf_size(const BVHRange& range, const vector<BVHReference>& references) const; - /* threads */ + /* Threads. */ enum { THREAD_TASK_SIZE = 4096 }; void thread_build_node(InnerNode *node, int child, @@ -92,41 +94,44 @@ protected: int thread_id); thread_mutex build_mutex; - /* progress */ + /* Progress. */ void progress_update(); - /* tree rotations */ + /* Tree rotations. */ void rotate(BVHNode *node, int max_depth); void rotate(BVHNode *node, int max_depth, int iterations); - /* objects and primitive references */ + /* Objects and primitive references. */ vector<Object*> objects; vector<BVHReference> references; int num_original_references; - /* output primitive indexes and objects */ + /* Output primitive indexes and objects. */ array<int>& prim_type; array<int>& prim_index; array<int>& prim_object; - /* build parameters */ + /* Build parameters. */ BVHParams params; - /* progress reporting */ + /* Progress reporting. 
*/ Progress& progress; double progress_start_time; size_t progress_count; size_t progress_total; size_t progress_original_total; - /* spatial splitting */ + /* Spatial splitting. */ float spatial_min_overlap; vector<BVHSpatialStorage> spatial_storage; size_t spatial_free_index; thread_spin_lock spatial_spin_lock; - /* threads */ + /* Threads. */ TaskPool task_pool; + + /* Unaligned building. */ + BVHUnaligned unaligned_heuristic; }; CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 8294690da7d..f5cd699bdf4 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -61,6 +61,76 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } } return cnt; + case BVH_STAT_ALIGNED_COUNT: + if(!is_unaligned()) { + cnt = 1; + } + break; + case BVH_STAT_UNALIGNED_COUNT: + if(is_unaligned()) { + cnt = 1; + } + break; + case BVH_STAT_ALIGNED_INNER_COUNT: + if(!is_leaf()) { + bool has_unaligned = false; + for(int j = 0; j < num_children(); j++) { + has_unaligned |= get_child(j)->is_unaligned(); + } + cnt += has_unaligned? 0: 1; + } + break; + case BVH_STAT_UNALIGNED_INNER_COUNT: + if(!is_leaf()) { + bool has_unaligned = false; + for(int j = 0; j < num_children(); j++) { + has_unaligned |= get_child(j)->is_unaligned(); + } + cnt += has_unaligned? 1: 0; + } + break; + case BVH_STAT_ALIGNED_INNER_QNODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned(); + } + else { + for(int j = 0; j < node->num_children(); j++) { + cnt += node->get_child(j)->getSubtreeSize(stat); + has_unaligned |= node->get_child(j)->is_unaligned(); + } + } + } + cnt += has_unaligned? 
0: 1; + } + return cnt; + case BVH_STAT_UNALIGNED_INNER_QNODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned(); + } + else { + for(int j = 0; j < node->num_children(); j++) { + cnt += node->get_child(j)->getSubtreeSize(stat); + has_unaligned |= node->get_child(j)->is_unaligned(); + } + } + } + cnt += has_unaligned? 1: 0; + } + return cnt; + case BVH_STAT_ALIGNED_LEAF_COUNT: + cnt = (is_leaf() && !is_unaligned()) ? 1 : 0; + break; + case BVH_STAT_UNALIGNED_LEAF_COUNT: + cnt = (is_leaf() && is_unaligned()) ? 1 : 0; + break; default: assert(0); /* unknown mode */ } diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index d476fb917ed..f2965a785e6 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -31,6 +31,14 @@ enum BVH_STAT { BVH_STAT_TRIANGLE_COUNT, BVH_STAT_CHILDNODE_COUNT, BVH_STAT_QNODE_COUNT, + BVH_STAT_ALIGNED_COUNT, + BVH_STAT_UNALIGNED_COUNT, + BVH_STAT_ALIGNED_INNER_COUNT, + BVH_STAT_UNALIGNED_INNER_COUNT, + BVH_STAT_ALIGNED_INNER_QNODE_COUNT, + BVH_STAT_UNALIGNED_INNER_QNODE_COUNT, + BVH_STAT_ALIGNED_LEAF_COUNT, + BVH_STAT_UNALIGNED_LEAF_COUNT, }; class BVHParams; @@ -38,16 +46,41 @@ class BVHParams; class BVHNode { public: - BVHNode() + BVHNode() : m_is_unaligned(false), + m_aligned_space(NULL) { } - virtual ~BVHNode() {} + virtual ~BVHNode() + { + delete m_aligned_space; + } + virtual bool is_leaf() const = 0; virtual int num_children() const = 0; virtual BVHNode *get_child(int i) const = 0; virtual int num_triangles() const { return 0; } virtual void print(int depth = 0) const = 0; + bool is_unaligned() const { return m_is_unaligned; } + + inline void set_aligned_space(const Transform& aligned_space) + { + m_is_unaligned = true; + if (m_aligned_space == NULL) { + m_aligned_space = new Transform(aligned_space); + } + else { + *m_aligned_space = aligned_space; + } + } + + 
inline Transform get_aligned_space() const + { + if(m_aligned_space == NULL) { + return transform_identity(); + } + return *m_aligned_space; + } BoundBox m_bounds; uint m_visibility; @@ -58,12 +91,20 @@ public: void deleteSubtree(); uint update_visibility(); + + bool m_is_unaligned; + + // TODO(sergey): Can be stored as 3x3 matrix, but better to have some + // utilities and type defines in util_transform first. + Transform *m_aligned_space; }; class InnerNode : public BVHNode { public: - InnerNode(const BoundBox& bounds, BVHNode* child0, BVHNode* child1) + InnerNode(const BoundBox& bounds, + BVHNode* child0, + BVHNode* child1) { m_bounds = bounds; children[0] = child0; diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index cf683df1b31..2e698a80742 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -20,6 +20,8 @@ #include "util_boundbox.h" +#include "kernel_types.h" + CCL_NAMESPACE_BEGIN /* BVH Parameters */ @@ -31,6 +33,9 @@ public: bool use_spatial_split; float spatial_split_alpha; + /* Unaligned nodes creation threshold */ + float unaligned_split_threshold; + /* SAH costs */ float sah_node_cost; float sah_primitive_cost; @@ -46,6 +51,14 @@ public: /* QBVH */ bool use_qbvh; + /* Mask of primitives to be included into the BVH. */ + int primitive_mask; + + /* Use unaligned bounding boxes. + * Only used for curves BVH. + */ + bool use_unaligned_nodes; + /* fixed parameters */ enum { MAX_DEPTH = 64, @@ -58,6 +71,8 @@ public: use_spatial_split = true; spatial_split_alpha = 1e-5f; + unaligned_split_threshold = 0.7f; + /* todo: see if splitting up primitive cost to be separate for triangles * and curves can help. so far in tests it doesn't help, but why? 
*/ sah_node_cost = 1.0f; @@ -69,6 +84,9 @@ public: top_level = false; use_qbvh = false; + use_unaligned_nodes = false; + + primitive_mask = PRIMITIVE_ALL; } /* SAH costs */ diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index e9032c61c3b..e5bcf9995bf 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -26,23 +26,27 @@ CCL_NAMESPACE_BEGIN static const int BVH_SORT_THRESHOLD = 4096; -/* Silly workaround for float extended precision that happens when compiling - * on x86, due to one float staying in 80 bit precision register and the other - * not, which causes the strictly weak ordering to break. - */ -#if !defined(__i386__) -# define NO_EXTENDED_PRECISION -#else -# define NO_EXTENDED_PRECISION volatile -#endif - struct BVHReferenceCompare { public: int dim; + const BVHUnaligned *unaligned_heuristic; + const Transform *aligned_space; + + BVHReferenceCompare(int dim, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) + : dim(dim), + unaligned_heuristic(unaligned_heuristic), + aligned_space(aligned_space) + { + } - explicit BVHReferenceCompare(int dim_) + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const { - dim = dim_; + return (aligned_space != NULL) + ? unaligned_heuristic->compute_aligned_prim_boundbox( + prim, *aligned_space) + : prim.bounds(); } /* Compare two references. 
@@ -52,8 +56,10 @@ public: __forceinline int compare(const BVHReference& ra, const BVHReference& rb) const { - NO_EXTENDED_PRECISION float ca = ra.bounds().min[dim] + ra.bounds().max[dim]; - NO_EXTENDED_PRECISION float cb = rb.bounds().min[dim] + rb.bounds().max[dim]; + BoundBox ra_bounds = get_prim_bounds(ra), + rb_bounds = get_prim_bounds(rb); + float ca = ra_bounds.min[dim] + ra_bounds.max[dim]; + float cb = rb_bounds.min[dim] + rb_bounds.max[dim]; if(ca < cb) return -1; else if(ca > cb) return 1; @@ -171,10 +177,15 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool, } } -void bvh_reference_sort(int start, int end, BVHReference *data, int dim) +void bvh_reference_sort(int start, + int end, + BVHReference *data, + int dim, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) { const int count = end - start; - BVHReferenceCompare compare(dim); + BVHReferenceCompare compare(dim, unaligned_heuristic, aligned_space); if(count < BVH_SORT_THRESHOLD) { /* It is important to not use any mutex if array is small enough, * otherwise we end up in situation when we're going to sleep far diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index 18aafb5f1ff..b49ca02eb60 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -20,7 +20,15 @@ CCL_NAMESPACE_BEGIN -void bvh_reference_sort(int start, int end, BVHReference *data, int dim); +class BVHUnaligned; +struct Transform; + +void bvh_reference_sort(int start, + int end, + BVHReference *data, + int dim, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index bf68b41021f..d0d5fbe5a7a 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -32,14 +32,18 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> 
*references, - float nodeSAH) + float nodeSAH, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) : sah(FLT_MAX), dim(0), num_left(0), left_bounds(BoundBox::empty), right_bounds(BoundBox::empty), storage_(storage), - references_(references) + references_(references), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { const BVHReference *ref_ptr = &references_->at(range.start()); float min_sah = FLT_MAX; @@ -51,12 +55,15 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, bvh_reference_sort(range.start(), range.end(), &references_->at(0), - dim); + dim, + unaligned_heuristic_, + aligned_space_); /* sweep right to left and determine bounds. */ BoundBox right_bounds = BoundBox::empty; for(int i = range.size() - 1; i > 0; i--) { - right_bounds.grow(ref_ptr[i].bounds()); + BoundBox prim_bounds = get_prim_bounds(ref_ptr[i]); + right_bounds.grow(prim_bounds); storage_->right_bounds[i - 1] = right_bounds; } @@ -64,7 +71,8 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, BoundBox left_bounds = BoundBox::empty; for(int i = 1; i < range.size(); i++) { - left_bounds.grow(ref_ptr[i - 1].bounds()); + BoundBox prim_bounds = get_prim_bounds(ref_ptr[i - 1]); + left_bounds.grow(prim_bounds); right_bounds = storage_->right_bounds[i - 1]; float sah = nodeSAH + @@ -88,16 +96,37 @@ void BVHObjectSplit::split(BVHRange& left, BVHRange& right, const BVHRange& range) { + assert(references_->size() > 0); /* sort references according to split */ bvh_reference_sort(range.start(), range.end(), &references_->at(0), - this->dim); + this->dim, + unaligned_heuristic_, + aligned_space_); + + BoundBox effective_left_bounds, effective_right_bounds; + const int num_right = range.size() - this->num_left; + if(aligned_space_ == NULL) { + effective_left_bounds = left_bounds; + effective_right_bounds = right_bounds; + } + else { + effective_left_bounds = BoundBox::empty; + effective_right_bounds = BoundBox::empty; + for(int i = 0; i < 
this->num_left; ++i) { + BoundBox prim_boundbox = references_->at(range.start() + i).bounds(); + effective_left_bounds.grow(prim_boundbox); + } + for(int i = 0; i < num_right; ++i) { + BoundBox prim_boundbox = references_->at(range.start() + this->num_left + i).bounds(); + effective_right_bounds.grow(prim_boundbox); + } + } /* split node ranges */ - left = BVHRange(this->left_bounds, range.start(), this->num_left); - right = BVHRange(this->right_bounds, left.end(), range.size() - this->num_left); - + left = BVHRange(effective_left_bounds, range.start(), this->num_left); + right = BVHRange(effective_right_bounds, left.end(), num_right); } /* Spatial Split */ @@ -106,16 +135,31 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH) + float nodeSAH, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) : sah(FLT_MAX), dim(0), pos(0.0f), storage_(storage), - references_(references) + references_(references), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { /* initialize bins. */ - float3 origin = range.bounds().min; - float3 binSize = (range.bounds().max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS); + BoundBox range_bounds; + if(aligned_space == NULL) { + range_bounds = range.bounds(); + } + else { + range_bounds = unaligned_heuristic->compute_aligned_boundbox( + range, + &references->at(0), + *aligned_space); + } + + float3 origin = range_bounds.min; + float3 binSize = (range_bounds.max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS); float3 invBinSize = 1.0f / binSize; for(int dim = 0; dim < 3; dim++) { @@ -131,8 +175,9 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, /* chop references into bins. 
*/ for(unsigned int refIdx = range.start(); refIdx < range.end(); refIdx++) { const BVHReference& ref = references_->at(refIdx); - float3 firstBinf = (ref.bounds().min - origin) * invBinSize; - float3 lastBinf = (ref.bounds().max - origin) * invBinSize; + BoundBox prim_bounds = get_prim_bounds(ref); + float3 firstBinf = (prim_bounds.min - origin) * invBinSize; + float3 lastBinf = (prim_bounds.max - origin) * invBinSize; int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z); int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z); @@ -140,7 +185,10 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1); for(int dim = 0; dim < 3; dim++) { - BVHReference currRef = ref; + BVHReference currRef(get_prim_bounds(ref), + ref.prim_index(), + ref.prim_object(), + ref.prim_type()); for(int i = firstBin[dim]; i < lastBin[dim]; i++) { BVHReference leftRef, rightRef; @@ -209,14 +257,15 @@ void BVHSpatialSplit::split(BVHBuild *builder, BoundBox right_bounds = BoundBox::empty; for(int i = left_end; i < right_start; i++) { - if(refs[i].bounds().max[this->dim] <= this->pos) { + BoundBox prim_bounds = get_prim_bounds(refs[i]); + if(prim_bounds.max[this->dim] <= this->pos) { /* entirely on the left-hand side */ - left_bounds.grow(refs[i].bounds()); + left_bounds.grow(prim_bounds); swap(refs[i], refs[left_end++]); } - else if(refs[i].bounds().min[this->dim] >= this->pos) { + else if(prim_bounds.min[this->dim] >= this->pos) { /* entirely on the right-hand side */ - right_bounds.grow(refs[i].bounds()); + right_bounds.grow(prim_bounds); swap(refs[i--], refs[--right_start]); } } @@ -231,8 +280,12 @@ void BVHSpatialSplit::split(BVHBuild *builder, new_refs.reserve(right_start - left_end); while(left_end < right_start) { /* split reference. 
*/ + BVHReference curr_ref(get_prim_bounds(refs[left_end]), + refs[left_end].prim_index(), + refs[left_end].prim_object(), + refs[left_end].prim_type()); BVHReference lref, rref; - split_reference(*builder, lref, rref, refs[left_end], this->dim, this->pos); + split_reference(*builder, lref, rref, curr_ref, this->dim, this->pos); /* compute SAH for duplicate/unsplit candidates. */ BoundBox lub = left_bounds; // Unsplit to left: new left-hand bounds. @@ -240,8 +293,8 @@ void BVHSpatialSplit::split(BVHBuild *builder, BoundBox ldb = left_bounds; // Duplicate: new left-hand bounds. BoundBox rdb = right_bounds; // Duplicate: new right-hand bounds. - lub.grow(refs[left_end].bounds()); - rub.grow(refs[left_end].bounds()); + lub.grow(curr_ref.bounds()); + rub.grow(curr_ref.bounds()); ldb.grow(lref.bounds()); rdb.grow(rref.bounds()); @@ -280,6 +333,17 @@ void BVHSpatialSplit::split(BVHBuild *builder, new_refs.begin(), new_refs.end()); } + if(aligned_space_ != NULL) { + left_bounds = right_bounds = BoundBox::empty; + for(int i = left_start; i < left_end - left_start; ++i) { + BoundBox prim_boundbox = references_->at(i).bounds(); + left_bounds.grow(prim_boundbox); + } + for(int i = right_start; i < right_end - right_start; ++i) { + BoundBox prim_boundbox = references_->at(i).bounds(); + right_bounds.grow(prim_boundbox); + } + } left = BVHRange(left_bounds, left_start, left_end - left_start); right = BVHRange(right_bounds, right_start, right_end - right_start); } @@ -295,11 +359,13 @@ void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh, Mesh::Triangle t = mesh->get_triangle(prim_index); const float3 *verts = &mesh->verts[0]; float3 v1 = tfm ? transform_point(tfm, verts[t.v[2]]) : verts[t.v[2]]; + v1 = get_unaligned_point(v1); for(int i = 0; i < 3; i++) { float3 v0 = v1; int vindex = t.v[i]; v1 = tfm ? 
transform_point(tfm, verts[vindex]) : verts[vindex]; + v1 = get_unaligned_point(v1); float v0p = v0[dim]; float v1p = v1[dim]; @@ -339,6 +405,8 @@ void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh, v0 = transform_point(tfm, v0); v1 = transform_point(tfm, v1); } + v0 = get_unaligned_point(v0); + v1 = get_unaligned_point(v1); float v0p = v0[dim]; float v1p = v1[dim]; @@ -473,6 +541,7 @@ void BVHSpatialSplit::split_reference(const BVHBuild& builder, /* intersect with original bounds. */ left_bounds.max[dim] = pos; right_bounds.min[dim] = pos; + left_bounds.intersect(ref.bounds()); right_bounds.intersect(ref.bounds()); diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index aea8b2565e0..dbdb51f1a5b 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -24,6 +24,7 @@ CCL_NAMESPACE_BEGIN class BVHBuild; +struct Transform; /* Object Split */ @@ -41,7 +42,9 @@ public: BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH); + float nodeSAH, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); void split(BVHRange& left, BVHRange& right, @@ -50,6 +53,19 @@ public: protected: BVHSpatialStorage *storage_; vector<BVHReference> *references_; + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } }; /* Spatial Split */ @@ -70,7 +86,9 @@ public: BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH); + float nodeSAH, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); void split(BVHBuild *builder, BVHRange& left, @@ -87,6 +105,8 @@ public: protected: BVHSpatialStorage *storage_; 
vector<BVHReference> *references_; + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; /* Lower-level functions which calculates boundaries of left and right nodes * needed for spatial split. @@ -132,6 +152,27 @@ protected: float pos, BoundBox& left_bounds, BoundBox& right_bounds); + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } + + __forceinline float3 get_unaligned_point(const float3& point) const + { + if(aligned_space_ == NULL) { + return point; + } + else { + return transform_point(aligned_space_, point); + } + } }; /* Mixed Object-Spatial Split */ @@ -148,19 +189,40 @@ public: bool no_split; + BoundBox bounds; + + BVHMixedSplit() {} + __forceinline BVHMixedSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - int level) + int level, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL) { + if(aligned_space == NULL) { + bounds = range.bounds(); + } + else { + bounds = unaligned_heuristic->compute_aligned_boundbox( + range, + &references->at(0), + *aligned_space); + } /* find split candidates. 
*/ - float area = range.bounds().safe_area(); + float area = bounds.safe_area(); leafSAH = area * builder->params.primitive_cost(range.size()); nodeSAH = area * builder->params.node_cost(2); - object = BVHObjectSplit(builder, storage, range, references, nodeSAH); + object = BVHObjectSplit(builder, + storage, + range, + references, + nodeSAH, + unaligned_heuristic, + aligned_space); if(builder->params.use_spatial_split && level < BVHParams::MAX_SPATIAL_DEPTH) { BoundBox overlap = object.left_bounds; @@ -171,7 +233,9 @@ public: storage, range, references, - nodeSAH); + nodeSAH, + unaligned_heuristic, + aligned_space); } } @@ -181,7 +245,10 @@ public: builder->range_within_max_leaf_size(range, *references)); } - __forceinline void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range) + __forceinline void split(BVHBuild *builder, + BVHRange& left, + BVHRange& right, + const BVHRange& range) { if(builder->params.use_spatial_split && minSAH == spatial.sah) spatial.split(builder, left, right, range); @@ -193,4 +260,3 @@ public: CCL_NAMESPACE_END #endif /* __BVH_SPLIT_H__ */ - diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp new file mode 100644 index 00000000000..a876c670914 --- /dev/null +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -0,0 +1,178 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "bvh_unaligned.h" + +#include "mesh.h" +#include "object.h" + +#include "bvh_binning.h" +#include "bvh_params.h" + +#include "util_boundbox.h" +#include "util_debug.h" +#include "util_transform.h" + +CCL_NAMESPACE_BEGIN + + +BVHUnaligned::BVHUnaligned(const vector<Object*>& objects) + : objects_(objects) +{ +} + +Transform BVHUnaligned::compute_aligned_space( + const BVHObjectBinning& range, + const BVHReference *references) const +{ + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + Transform aligned_space; + /* Use first primitive which defines correct direction to define + * the orientation space. + */ + if(compute_aligned_space(ref, &aligned_space)) { + return aligned_space; + } + } + return transform_identity(); +} + +Transform BVHUnaligned::compute_aligned_space( + const BVHRange& range, + const BVHReference *references) const +{ + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + Transform aligned_space; + /* Use first primitive which defines correct direction to define + * the orientation space. 
+ */ + if(compute_aligned_space(ref, &aligned_space)) { + return aligned_space; + } + } + return transform_identity(); +} + +bool BVHUnaligned::compute_aligned_space(const BVHReference& ref, + Transform *aligned_space) const +{ + const Object *object = objects_[ref.prim_object()]; + const int packed_type = ref.prim_type(); + const int type = (packed_type & PRIMITIVE_ALL); + if(type & PRIMITIVE_CURVE) { + const int curve_index = ref.prim_index(); + const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); + const Mesh *mesh = object->mesh; + const Mesh::Curve& curve = mesh->get_curve(curve_index); + const int key = curve.first_key + segment; + const float3 v1 = mesh->curve_keys[key], + v2 = mesh->curve_keys[key + 1]; + float length; + const float3 axis = normalize_len(v2 - v1, &length); + if(length > 1e-6f) { + *aligned_space = make_transform_frame(axis); + return true; + } + } + *aligned_space = transform_identity(); + return false; +} + +BoundBox BVHUnaligned::compute_aligned_prim_boundbox( + const BVHReference& prim, + const Transform& aligned_space) const +{ + BoundBox bounds = BoundBox::empty; + const Object *object = objects_[prim.prim_object()]; + const int packed_type = prim.prim_type(); + const int type = (packed_type & PRIMITIVE_ALL); + if(type & PRIMITIVE_CURVE) { + const int curve_index = prim.prim_index(); + const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); + const Mesh *mesh = object->mesh; + const Mesh::Curve& curve = mesh->get_curve(curve_index); + curve.bounds_grow(segment, + &mesh->curve_keys[0], + &mesh->curve_radius[0], + aligned_space, + bounds); + } + else { + bounds = prim.bounds().transformed(&aligned_space); + } + return bounds; +} + +BoundBox BVHUnaligned::compute_aligned_boundbox( + const BVHObjectBinning& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds) const +{ + BoundBox bounds = BoundBox::empty; + if(cent_bounds != NULL) { + *cent_bounds = BoundBox::empty; + } + for(int i = 
range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + BoundBox ref_bounds = compute_aligned_prim_boundbox(ref, aligned_space); + bounds.grow(ref_bounds); + if(cent_bounds != NULL) { + cent_bounds->grow(ref_bounds.center2()); + } + } + return bounds; +} + +BoundBox BVHUnaligned::compute_aligned_boundbox( + const BVHRange& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds) const +{ + BoundBox bounds = BoundBox::empty; + if(cent_bounds != NULL) { + *cent_bounds = BoundBox::empty; + } + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + BoundBox ref_bounds = compute_aligned_prim_boundbox(ref, aligned_space); + bounds.grow(ref_bounds); + if(cent_bounds != NULL) { + cent_bounds->grow(ref_bounds.center2()); + } + } + return bounds; +} + +Transform BVHUnaligned::compute_node_transform( + const BoundBox& bounds, + const Transform& aligned_space) +{ + Transform space = aligned_space; + space.x.w -= bounds.min.x; + space.y.w -= bounds.min.y; + space.z.w -= bounds.min.z; + float3 dim = bounds.max - bounds.min; + return transform_scale(1.0f / max(1e-18f, dim.x), + 1.0f / max(1e-18f, dim.y), + 1.0f / max(1e-18f, dim.z)) * space; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h new file mode 100644 index 00000000000..4d0872f4a39 --- /dev/null +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -0,0 +1,81 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH_UNALIGNED_H__ +#define __BVH_UNALIGNED_H__ + +#include "util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BoundBox; +class BVHObjectBinning; +class BVHRange; +class BVHReference; +struct Transform; +class Object; + +/* Helper class to perform calculations needed for unaligned nodes. */ +class BVHUnaligned { +public: + BVHUnaligned(const vector<Object*>& objects); + + /* Calculate alignment for the oriented node for a given range. */ + Transform compute_aligned_space( + const BVHObjectBinning& range, + const BVHReference *references) const; + Transform compute_aligned_space( + const BVHRange& range, + const BVHReference *references) const; + + /* Calculate alignment for the oriented node for a given reference. + * + * Return true when space was calculated successfully. + */ + bool compute_aligned_space(const BVHReference& ref, + Transform *aligned_space) const; + + /* Calculate primitive's bounding box in given space. */ + BoundBox compute_aligned_prim_boundbox( + const BVHReference& prim, + const Transform& aligned_space) const; + + /* Calculate bounding box in given space. */ + BoundBox compute_aligned_boundbox( + const BVHObjectBinning& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds = NULL) const; + BoundBox compute_aligned_boundbox( + const BVHRange& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds = NULL) const; + + /* Calculate affine transform for node packing. + * Bounds will be in the range of 0..1. + */ + static Transform compute_node_transform(const BoundBox& bounds, + const Transform& aligned_space); +protected: + /* List of objects BVH is being created for. 
*/ + const vector<Object*>& objects_; +}; + +CCL_NAMESPACE_END + +#endif /* __BVH_UNALIGNED_H__ */ + diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index f0adbc03e22..bd3969b2889 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -28,6 +28,22 @@ set(SRC kernels/cuda/kernel.cu ) +set(SRC_BVH_HEADERS + bvh/bvh.h + bvh/bvh_nodes.h + bvh/bvh_shadow_all.h + bvh/bvh_subsurface.h + bvh/bvh_traversal.h + bvh/bvh_volume.h + bvh/bvh_volume_all.h + bvh/qbvh_nodes.h + bvh/qbvh_shadow_all.h + bvh/qbvh_subsurface.h + bvh/qbvh_traversal.h + bvh/qbvh_volume.h + bvh/qbvh_volume_all.h +) + set(SRC_HEADERS kernel_accumulate.h kernel_bake.h @@ -140,23 +156,11 @@ set(SRC_SVM_HEADERS set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h - geom/geom_bvh.h - geom/geom_bvh_shadow.h - geom/geom_bvh_subsurface.h - geom/geom_bvh_traversal.h - geom/geom_bvh_volume.h - geom/geom_bvh_volume_all.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_object.h geom/geom_primitive.h - geom/geom_qbvh.h - geom/geom_qbvh_shadow.h - geom/geom_qbvh_subsurface.h - geom/geom_qbvh_traversal.h - geom/geom_qbvh_volume.h - geom/geom_qbvh_volume_all.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h @@ -212,7 +216,14 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) + set(cuda_sources kernels/cuda/kernel.cu + ${SRC_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch experimental) @@ -312,6 +323,7 @@ add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -346,6 +358,7 @@ 
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/bvh/bvh.h index d0eedd3396a..59881738195 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -35,6 +35,13 @@ CCL_NAMESPACE_BEGIN # define ccl_device_intersect ccl_device_inline #endif +/* bottom-most stack entry, indicating the end of traversal */ +#define ENTRYPOINT_SENTINEL 0x76543210 + +/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ +#define BVH_STACK_SIZE 192 +#define BVH_QSTACK_SIZE 384 + /* BVH intersection function variations */ #define BVH_INSTANCING 1 @@ -72,71 +79,73 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. 
*/ #ifdef __QBVH__ -# include "geom_qbvh.h" +# include "qbvh_nodes.h" #endif /* Regular BVH traversal */ +#include "bvh_nodes.h" + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "geom_bvh_traversal.h" +#include "bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_subsurface.h" #endif #if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion -# define BVH_FUNCTION_FEATURES BVH_MOTION -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR +# include "bvh_subsurface.h" #endif /* Volume BVH traversal */ #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__INSTANCING__) # define 
BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume.h" #endif /* Record all intersections - Shadow BVH traversal */ @@ -144,51 +153,51 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif /* Record all intersections - Volume BVH traversal */ #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all -# define BVH_FUNCTION_FEATURES 0 -# include 
"geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume_all.h" #endif #undef BVH_FEATURE diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h new file mode 100644 index 00000000000..db2275b0ff8 --- /dev/null +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -0,0 +1,656 @@ +/* + * Copyright 2011-2016, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and +// 3-vector which might be faster. 
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, + int node_addr, + int child) +{ + Transform space; + const int child_addr = node_addr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); + space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return space; +} + +#if !defined(__KERNEL_SSE2__) +ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + 
extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 lower_xyz = aligned_P * nrdir; + float3 upper_xyz = lower_xyz - nrdir; + const float near_x = min(lower_xyz.x, upper_xyz.x); + const float near_y = min(lower_xyz.y, upper_xyz.y); + const float near_z = min(lower_xyz.z, upper_xyz.z); + const float far_x = max(lower_xyz.x, upper_xyz.x); + const float far_y = max(lower_xyz.y, upper_xyz.y); + const float far_z = max(lower_xyz.z, upper_xyz.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + return tnear <= tfar; +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); + const float near_y = min(tLowerXYZ.y, 
tUpperXYZ.y); + const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); + const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); + const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); + const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + if(difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down*tnear <= round_up*tfar; + } + else { + return tnear <= tfar; + } +} + +ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & 
visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + idir, + t, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + idir, + t, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_inline bvh_aligned_node_intersect( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], 
shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_aligned_node_intersect_robust( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if(difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 *tminmaxview = (float4*)&tminmax; + 
float& c0min = tminmaxview->x, &c1min = tminmaxview->y; + float& c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + 
upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask = tnear <= tfar; + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 
0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask; + if(difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down*tnear <= round_up*tfar; + } + else { + vmask = tnear <= tfar; + } + + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 
2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + isect_near, + isect_far, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + isect_near, + isect_far, + difl, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 4005489f77d..1869457f0c3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_shadow.h" +# include "qbvh_shadow_all.h" +#endif + +#if 
BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function, where various features can be @@ -41,14 +47,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -93,130 +102,87 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_ahild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, # endif - + idir, + isect_t, + node_addr, + PATH_RAY_SHADOW, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, # endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_SHADOW, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_ahild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_ahild1; + node_addr_ahild1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_ahild1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_ahild1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -226,22 +192,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, 
PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -253,6 +254,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* shadow ray early termination */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. */ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -283,23 +287,20 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - - isect_array->t = isect_t; + /* Move on to next entry in intersections array */ + isect_array++; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -317,21 +318,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = 
kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -369,15 +373,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } @@ -410,3 +417,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 915e9415c93..18978efcfa3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_subsurface.h" +# include "qbvh_subsurface.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for subsurface scattering, where @@ -44,12 +50,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal 
variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* ray parameters in registers */ float3 P = ray->P; @@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,127 +109,94 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* traversal loop */ do { - do - { + do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - 
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + 
tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -230,15 +206,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -251,8 +227,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, @@ 
-286,3 +262,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index ae919ef3f86..68a11b65ad7 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,15 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_traversal.h" +# include "qbvh_traversal.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust #endif /* This is a template BVH traversal function, where various features can be @@ -49,14 +57,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,174 +111,148 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse 
internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - # if 
BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -# endif - + { + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); + } #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float4 *tminmaxview = (float4*)&tminmax; - float 
&c0min = tminmaxview->x, &c1min = tminmaxview->y; - float &c0max = tminmaxview->z, &c1max = tminmaxview->w; - - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); -# endif + { + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. 
*/ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } BVH_DEBUG_NEXT_STEP(); } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + 
visibility, + object, + prim_addr)) + { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif #else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -278,15 +263,26 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -299,20 +295,47 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + if(kernel_data.curve.curveflags & 
CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -327,7 +350,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); @@ -342,24 +365,27 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); BVH_DEBUG_NEXT_INSTANCE(); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -376,16 +402,19 @@ ccl_device bool 
BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -433,3 +462,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index f3edf85d723..03499e94347 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume.h" +# include "qbvh_volume.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -43,12 +49,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t 
shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -90,143 +99,124 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) 
* idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool 
closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr); } break; } @@ -239,7 +229,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -258,29 +248,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = 
ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -298,16 +291,19 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -337,3 +333,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index ec837212471..b5405e8e57b 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume_all.h" +# include "qbvh_volume_all.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -44,12 +50,12 @@ 
ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -94,137 +103,109 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_array->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - 
NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, 
tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); bool hit; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -239,6 +220,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -246,23 +230,28 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -277,6 +266,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -290,7 +282,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -311,29 +303,32 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -368,16 +363,19 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, 
-isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } @@ -410,3 +408,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h new file mode 100644 index 00000000000..4d8695bedec --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -0,0 +1,433 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +struct QBVHStackItem { + int addr; + float dist; +}; + +/* TODO(sergey): Investigate if using intrinsics helps for both + * stack item swap and float comparison.
+ */ +ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, + QBVHStackItem *ccl_restrict b) +{ + QBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3, + QBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#else + const sse3f& org, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, 
offset+far_z), idir.z, org_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; +#endif + +#ifdef __KERNEL_SSE41__ + const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); + const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); + const sseb vmask = cast(tnear) > cast(tfar); + int mask = (int)movemask(vmask)^0xf; +#else + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); +#endif + *dist = tnear; + return mask; +} + +ccl_device_inline int qbvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#else + const sse3f& P, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, 
P_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; +#endif + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return (int)movemask(vmask); +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int qbvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const 
ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y 
= max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#endif +} + +ccl_device_inline int qbvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + 
P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); +#endif + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
+ */ + +ccl_device_inline int qbvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return qbvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int qbvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return qbvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + 
far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index edb5b5c78c3..34753ff067d 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -27,6 +27,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(tmax); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,53 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +161,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,86 +186,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } #endif - int primAddr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. 
*/ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -241,22 +275,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -268,6 +337,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Shadow ray early termination. */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. 
*/ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -298,23 +370,21 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - + /* Move on to next entry in intersections array */ + isect_array++; isect_array->t = isect_t; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -329,28 +399,33 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != 
ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -383,21 +458,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(tmax); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index 84512a8783c..03794e3a882 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -25,6 +25,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, SubsurfaceIntersection *ss_isect, @@ -41,12 +47,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* Ray parameters in registers. */ float3 P = ray->P; @@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -106,29 +116,43 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -137,24 +161,24 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -162,82 +186,82 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* Intersect ray against primitive, */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -247,15 +271,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* Intersect ray against primitive. */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -267,6 +291,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, break; } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h new file mode 100644 index 00000000000..f82ff661495 --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -0,0 +1,505 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + + ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + + if(UNLIKELY(node_dist > isect->t) +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0) +#endif + { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + ssef dist; + + BVH_DEBUG_NEXT_STEP(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + float4 cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. 
*/ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
*/ +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index ab2e530dd20..b4f334eb842 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect, @@ -38,12 +44,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ float3 P = ray->P; @@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect->object = OBJECT_NONE; ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -92,29 +102,52 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -123,24 +156,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -148,102 +181,102 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); } break; } @@ -253,7 +286,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -268,34 +301,39 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -309,21 +347,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 5546471b0e3..a877e5bb341 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. 
*/ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,52 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +160,24 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,96 +185,94 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; bool hit; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -256,30 +287,31 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -294,6 +326,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -304,7 +339,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -320,35 +355,40 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -379,23 +419,30 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); isect_t = tmax; isect_array->t = isect_t; object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index c94a5384d1f..d2c7edb11ea 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -15,27 +15,6 @@ * limitations under the License. 
*/ -/* bottom-most stack entry, indicating the end of traversal */ -#define ENTRYPOINT_SENTINEL 0x76543210 - -/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ -#define BVH_STACK_SIZE 192 -#define BVH_QSTACK_SIZE 384 -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 7 -#define BVH_QNODE_LEAF_SIZE 1 -#define TRI_NODE_SIZE 3 - -/* silly workaround for float extended precision that happens when compiling - * without sse support on x86, it results in different results for float ops - * that you would otherwise expect to compare correctly */ -#if !defined(__i386__) || defined(__SSE__) -# define NO_EXTENDED_PRECISION -#else -# define NO_EXTENDED_PRECISION volatile -#endif - #include "geom_attribute.h" #include "geom_object.h" #include "geom_triangle.h" @@ -45,5 +24,4 @@ #include "geom_curve.h" #include "geom_volume.h" #include "geom_primitive.h" -#include "geom_bvh.h" diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8894843997c..292e1bfca0e 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -450,8 +450,8 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect else if(level == 1) { /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ float t = isect->t; float u = 0.0f; float gd = 0.0f; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index ffe55529110..2fb8e219884 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -47,13 +47,13 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint return (attr_map.y == ATTR_ELEMENT_NONE) ? 
(int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) +ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) { if(step == numsteps) { /* center step: regular vertex location */ - verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + verts[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + verts[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } else { /* center step not store in this array */ @@ -62,19 +62,19 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 offset += step*numverts; - verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } -ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) +ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4 
tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) { if(step == numsteps) { /* center step: regular vertex location */ - normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); } else { /* center step not stored in this array */ @@ -83,9 +83,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float offset += step*numverts; - normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } @@ -107,7 +107,7 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i /* fetch vertex coordinates */ float3 next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); @@ -259,7 +259,7 @@ ccl_device_noinline void 
motion_triangle_shader_setup(KernelGlobals *kg, ShaderD /* fetch vertex coordinates */ float3 verts[3], next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim))); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h deleted file mode 100644 index 2a2d7822eee..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -struct QBVHStackItem { - int addr; - float dist; -}; - -/* TOOD(sergey): Investigate if using intrinsics helps for both - * stack item swap and float comparison. 
- */ -ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a, - QBVHStackItem *__restrict b) -{ - QBVHStackItem tmp = *a; - *a = *b; - *b = tmp; -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3, - QBVHStackItem *__restrict s4) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } -} - -ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& org_idir, -#else - const sse3f& org, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); -#else - const ssef tnear_x = 
(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; -#endif - -#ifdef __KERNEL_SSE41__ - const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear)); - const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar)); - const sseb vmask = cast(tNear) > cast(tFar); - int mask = (int)movemask(vmask)^0xf; -#else - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = tNear <= tFar; - int mask = (int)movemask(vmask); -#endif - *dist = tNear; - return mask; -} - -ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& P_idir, -#else - const sse3f& P, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - const float difl, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); - 
const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); -#else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; -#endif - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = round_down*tNear <= round_up*tFar; - *dist = tNear; - return (int)movemask(vmask); -} diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h deleted file mode 100644 index 738d08ac6fc..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation, - * and code copyright 2009-2012 Intel Corporation - * - * Modifications Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width - * BVH_MOTION: motion blur rendering - * - */ - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - ,uint *lcg_state, - float difl, - float extmax -#endif - ) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; - traversalStack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; - float nodeDist = -FLT_MAX; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - BVH_DEBUG_INIT(); - - ssef tnear(0.0f), tfar(ray->t); - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. 
*/ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - if(UNLIKELY(nodeDist > isect->t)) { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int traverseChild; - ssef dist; - - BVH_DEBUG_NEXT_STEP(); - -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - /* NOTE: We extend all the child BB instead of fetching - * and checking visibility flags for each of the, - * - * Need to test if doing opposite would be any faster. - */ - traverseChild = qbvh_node_intersect_robust(kg, - tnear, - tfar, -# ifdef __KERNEL_AVX2__ - P_idir4, -# else - org, -# endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - difl, - &dist); - } - else -#endif /* BVH_HAIR_MINIMUM_WIDTH */ - { - traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, -#endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - } - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); - - /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - float d0 = ((float*)&dist)[r]; - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); - nodeDist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. 
- */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(traverseChild); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { - if(d1 < d0) { - nodeAddr = c1; - nodeDist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - continue; - } - else { - nodeAddr = c0; - nodeDist = d0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(traverseChild); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. 
- */ - r = __bscf(traverseChild); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); - } - - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } - - /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - -#ifdef __VISIBILITY_FLAG__ - if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) -#else - if(UNLIKELY((nodeDist > isect->t))) -#endif - { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int primAddr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { -#endif - int primAddr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - if(hit) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); - -# if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_itfm); -# else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; - traversalStack[stackPtr].dist = -FLT_MAX; - - nodeAddr = kernel_tex_fetch(__object_node, object); - - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. 
*/ -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 995dfac5b09..0c2351e1d1b 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -27,12 +27,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = 
float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - /* return normal */ if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) return normalize(cross(v2 - v0, v1 - v0)); @@ -44,11 +43,10 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute point */ float t = 1.0f - u - v; @@ -71,11 +69,10 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + P[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + P[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } /* Interpolate 
smooth vertex normal from vertices */ @@ -83,11 +80,10 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); } @@ -97,11 +93,10 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) { /* fetch triangle vertex coordinates */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + const float3 p0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 p1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 p2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute derivatives of P w.r.t. 
uv */ *dPdu = (p0 - p2); @@ -119,11 +114,11 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); - float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); - float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z)); + float f0 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.x); + float f1 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.y); + float f2 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; @@ -162,11 +157,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + 
tri_vindex.y)); + float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index b6dfc769012..fc081bda525 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -106,9 +106,10 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. */ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -202,9 +203,10 @@ ccl_device_inline void triangle_intersect_subsurface( const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. 
*/ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -324,9 +326,10 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D*t; - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); @@ -381,9 +384,10 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, P = P + D*t; #ifdef __INTERSECTION_REFINE__ - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 
tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 42314756f02..08f6f457805 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -42,6 +42,7 @@ #define ccl_constant #define ccl_may_alias #define ccl_addr_space +#define ccl_restrict __restrict__ /* No assert supported for CUDA */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index a5708448e23..8505cb85576 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_global __global #define ccl_local __local #define ccl_private __private +#define ccl_restrict restrict #ifdef __SPLIT_KERNEL__ # define ccl_addr_space __global diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 736a884f819..93c4bd3f7d5 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -51,8 +51,8 @@ ccl_device float area_light_sample(float3 P, bool sample_coord) { /* In our name system we're using P for the center, - * which is o in the paper. - */ + * which is o in the paper. 
+ */ float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; float axisu_len, axisv_len; diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 3c3503eab8b..d5b31037723 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -25,6 +25,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 94598e2565e..731dc0407c5 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -309,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, const ccl_addr_space PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) { return lcg_init(*rng + state->rng_offset + state->sample*scramble); } diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 245d236ff97..5ba262c1044 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -25,7 +25,8 @@ /* bvh */ KERNEL_TEX(float4, texture_float4, __bvh_nodes) KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __tri_storage) +KERNEL_TEX(float4, texture_float4, __prim_tri_verts) +KERNEL_TEX(uint, texture_uint, __prim_tri_index) KERNEL_TEX(uint, texture_uint, __prim_type) KERNEL_TEX(uint, texture_uint, __prim_visibility) KERNEL_TEX(uint, texture_uint, __prim_index) @@ -39,8 +40,7 @@ KERNEL_TEX(float4, texture_float4, __objects_vector) /* triangles */ KERNEL_TEX(uint, texture_uint, __tri_shader) KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(float4, texture_float4, __tri_vindex) -KERNEL_TEX(float4, texture_float4, __tri_verts) 
+KERNEL_TEX(uint4, texture_uint4, __tri_vindex) /* curves */ KERNEL_TEX(float4, texture_float4, __curves) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 76d2a6b98e6..5de58ba28ed 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -292,11 +292,14 @@ enum PathRayFlag { PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024), + /* Special flag to tag unaligned BVH nodes. */ + PATH_RAY_NODE_UNALIGNED = 2048, - PATH_RAY_MIS_SKIP = 2048, - PATH_RAY_DIFFUSE_ANCESTOR = 4096, - PATH_RAY_SINGLE_PASS_DONE = 8192, + PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + + PATH_RAY_MIS_SKIP = 4096, + PATH_RAY_DIFFUSE_ANCESTOR = 8192, + PATH_RAY_SINGLE_PASS_DONE = 16384, }; /* Closure Label */ @@ -769,7 +772,7 @@ typedef ccl_addr_space struct ShaderData { int type; /* parametric coordinates - * - barycentric weights for triangles */ + * - barycentric weights for triangles */ float u; float v; /* object id if there is one, ~0 otherwise */ @@ -792,14 +795,14 @@ typedef ccl_addr_space struct ShaderData { #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is - * not readily suitable as a tangent for shading on triangles. */ + * not readily suitable as a tangent for shading on triangles. 
*/ float3 dPdu; float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ + * re-interpolating them constantly for shading */ Transform ob_tfm; Transform ob_itfm; #endif @@ -1171,11 +1174,11 @@ typedef ccl_addr_space struct DebugData { #define QUEUE_EMPTY_SLOT -1 /* -* Queue 1 - Active rays -* Queue 2 - Background queue -* Queue 3 - Shadow ray cast kernel - AO -* Queeu 4 - Shadow ray cast kernel - direct lighting -*/ + * Queue 1 - Active rays + * Queue 2 - Background queue + * Queue 3 - Shadow ray cast kernel - AO + * Queeu 4 - Shadow ray cast kernel - direct lighting + */ #define NUM_QUEUES 4 /* Queue names */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index aad06ed5c76..37907cd8fdc 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -35,6 +35,7 @@ # include "../../kernel_montecarlo.h" # include "../../kernel_projection.h" # include "../../geom/geom.h" +# include "../../bvh/bvh.h" # include "../../kernel_accumulate.h" # include "../../kernel_camera.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index ebe739ebd0e..2bb2be5e6b3 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -47,6 +47,7 @@ #include "kernel_camera.h" #include "kernels/cpu/kernel_cpu_image.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_projection.h" #include "kernel_accumulate.h" @@ -912,7 +913,7 @@ bool OSLRenderServices::texture(ustring filename, #endif bool status; - if(filename[0] == '@') { + if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); @@ -993,7 +994,7 @@ bool OSLRenderServices::texture3d(ustring filename, } bool status; - if(filename[0] == '@') { + 
if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 49030f33c26..b43f8402d42 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,6 +81,7 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index a00401845c8..7cd2922dd4f 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -88,7 +88,7 @@ shader node_image_texture( string color_space = "sRGB", string projection = "flat", string interpolation = "smartcubic", - string wrap = "periodic", + string extension = "periodic", float projection_blend = 0.0, int is_float = 1, int use_alpha = 1, @@ -108,7 +108,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "box") { /* object space normal */ @@ -184,7 +184,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[0] * tmp_alpha; } if (weight[1] > 0.0) { @@ -195,7 +195,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[1] * tmp_alpha; } if (weight[2] > 0.0) { @@ -206,7 +206,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[2] * tmp_alpha; } } @@ -219,7 +219,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "tube") { point projected = map_to_tube(texco_remap_square(p)); @@ -230,6 +230,6 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } } diff 
--git a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl new file mode 100644 index 00000000000..903dfcdc881 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl @@ -0,0 +1,25 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_rgb_to_bw( + color Color = 0.0, + output float Val = 0.0) +{ + Val = Color[0] * 0.2126 + Color[1] * 0.7152 + Color[2] * 0.0722; +} + diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index e1c7e2cea99..88d6dab04d0 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -31,6 +31,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index aa9c07c867e..44732734c31 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -72,8 +72,16 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint width = info.x; uint height = info.y; uint offset = info.z; - uint periodic = (info.w & 0x1); - uint interpolation = info.w >> 1; + + /* Image Options */ + uint interpolation = (info.w & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; + uint extension; + if(info.w & (1 << 1)) + extension = EXTENSION_REPEAT; + else if(info.w & (1 << 2)) + extension = EXTENSION_EXTEND; + else + extension = EXTENSION_CLIP; float4 r; int ix, iy, nix, niy; @@ -81,22 +89,26 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, svm_image_texture_frac(x*width, &ix); svm_image_texture_frac(y*height, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); } - else { + else if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else { /* EXTENSION_EXTEND */ ix = svm_image_texture_wrap_clamp(ix, width); iy = svm_image_texture_wrap_clamp(iy, height); - } + r = svm_image_texture_read(kg, id, offset + ix + iy*width); } - else { /* We default to linear interpolation if it is not closest */ + else { /* INTERPOLATION_LINEAR */ float tx = svm_image_texture_frac(x*width - 0.5f, &ix); float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); @@ -104,14 +116,17 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, niy = svm_image_texture_wrap_periodic(iy+1, height); } else { - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } nix = svm_image_texture_wrap_clamp(ix+1, width); niy = svm_image_texture_wrap_clamp(iy+1, height); + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); } - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); r += 
(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index ecde2e99a7b..614620c14af 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -1076,6 +1076,26 @@ void ImageManager::device_update_slot(Device *device, } } +uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) +{ + uint8_t options = 0; + + /* Image Options are packed into one uint: + * bit 0 -> Interpolation + * bit 1 + 2 + 3-> Extension */ + if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) + options |= (1 << 0); + + if(images[type][slot]->extension == EXTENSION_REPEAT) + options |= (1 << 1); + else if(images[type][slot]->extension == EXTENSION_EXTEND) + options |= (1 << 2); + else /* EXTENSION_CLIP */ + options |= (1 << 3); + + return options; +} + void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progress& /*progess*/) @@ -1107,11 +1127,9 @@ void ImageManager::device_pack_images(Device *device, device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - /* The image options are packed - bit 0 -> periodic - bit 1 + 2 -> interpolation type */ - uint8_t interpolation = (images[type][slot]->interpolation << 1) + 1; - info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, interpolation); + uint8_t options = pack_image_options(type, slot); + + info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); offset += tex_img.size(); @@ -1139,11 +1157,8 @@ void ImageManager::device_pack_images(Device *device, /* todo: support 3D textures, only CPU for now */ - /* The image options are packed - bit 0 -> periodic - bit 1 + 2 -> interpolation type */ - uint8_t 
interpolation = (images[type][slot]->interpolation << 1) + 1; - info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, interpolation); + uint8_t options = pack_image_options(type, slot); + info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); offset += tex_img.size(); diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 01d02f4dbec..07998684b23 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -122,6 +122,8 @@ private: int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); + uint8_t pack_image_options(ImageDataType type, size_t slot); + void device_load_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot, Progress *progess); void device_free_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot); diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 764a925983e..661719ed545 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -73,6 +73,37 @@ void Mesh::Curve::bounds_grow(const int k, const float3 *curve_keys, const float bounds.grow(upper, mr); } +void Mesh::Curve::bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + const Transform& aligned_space, + BoundBox& bounds) const +{ + float3 P[4]; + + P[0] = curve_keys[max(first_key + k - 1,first_key)]; + P[1] = curve_keys[first_key + k]; + P[2] = curve_keys[first_key + k + 1]; + P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)]; + + P[0] = transform_point(&aligned_space, P[0]); + P[1] = transform_point(&aligned_space, P[1]); + P[2] = transform_point(&aligned_space, P[2]); + P[3] = transform_point(&aligned_space, P[3]); + + float3 lower; + float3 upper; + + curvebounds(&lower.x, 
&upper.x, P, 0); + curvebounds(&lower.y, &upper.y, P, 1); + curvebounds(&lower.z, &upper.z, P, 2); + + float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]); + + bounds.grow(lower, mr); + bounds.grow(upper, mr); +} + /* Mesh */ NODE_DEFINE(Mesh) @@ -472,30 +503,19 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) } } -void Mesh::pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset) +void Mesh::pack_verts(const vector<uint>& tri_prim_index, + uint4 *tri_vindex, + size_t vert_offset, + size_t tri_offset) { - size_t verts_size = verts.size(); - - if(verts_size) { - float3 *verts_ptr = &verts[0]; - - for(size_t i = 0; i < verts_size; i++) { - float3 p = verts_ptr[i]; - tri_verts[i] = make_float4(p.x, p.y, p.z, 0.0f); - } - } - - size_t triangles_size = num_triangles(); - + const size_t triangles_size = num_triangles(); if(triangles_size) { for(size_t i = 0; i < triangles_size; i++) { Triangle t = get_triangle(i); - - tri_vindex[i] = make_float4( - __int_as_float(t.v[0] + vert_offset), - __int_as_float(t.v[1] + vert_offset), - __int_as_float(t.v[2] + vert_offset), - 0); + tri_vindex[i] = make_uint4(t.v[0] + vert_offset, + t.v[1] + vert_offset, + t.v[2] + vert_offset, + tri_prim_index[i + tri_offset]); } } } @@ -533,7 +553,11 @@ void Mesh::pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, s } } -void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total) +void Mesh::compute_bvh(DeviceScene *dscene, + SceneParams *params, + Progress *progress, + int n, + int total) { if(progress->get_cancel()) return; @@ -564,6 +588,7 @@ void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total BVHParams bparams; bparams.use_spatial_split = params->use_bvh_spatial_split; bparams.use_qbvh = params->use_qbvh; + bparams.use_unaligned_nodes = dscene->data.bvh.have_curves; delete bvh; bvh = BVH::create(bparams, objects); @@ -1070,42 +1095,82 @@ void 
MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, } } -void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) +void MeshManager::mesh_calc_offset(Scene *scene) { - /* count and update offsets */ size_t vert_size = 0; size_t tri_size = 0; - size_t curve_key_size = 0; size_t curve_size = 0; foreach(Mesh *mesh, scene->meshes) { mesh->vert_offset = vert_size; mesh->tri_offset = tri_size; - mesh->curvekey_offset = curve_key_size; mesh->curve_offset = curve_size; vert_size += mesh->verts.size(); tri_size += mesh->num_triangles(); - curve_key_size += mesh->curve_keys.size(); curve_size += mesh->num_curves(); } +} +void MeshManager::device_update_mesh(Device *device, + DeviceScene *dscene, + Scene *scene, + bool for_displacement, + Progress& progress) +{ + /* Count. */ + size_t vert_size = 0; + size_t tri_size = 0; + size_t curve_key_size = 0; + size_t curve_size = 0; + foreach(Mesh *mesh, scene->meshes) { + vert_size += mesh->verts.size(); + tri_size += mesh->num_triangles(); + curve_key_size += mesh->curve_keys.size(); + curve_size += mesh->num_curves(); + } + /* Create mapping from triangle to primitive triangle array. */ + vector<uint> tri_prim_index(tri_size); + if(for_displacement) { + /* For displacement kernels we do some trickery to make them believe + * we've got all required data ready. However, that data is different + * from final render kernels since we don't have BVH yet, so can't + * really use same semantic of arrays. + */ + foreach(Mesh *mesh, scene->meshes) { + for(size_t i = 0; i < mesh->num_triangles(); ++i) { + tri_prim_index[i + mesh->tri_offset] = 3 * (i + mesh->tri_offset); + } + } + } + else { + PackedBVH& pack = bvh->pack; + for(size_t i = 0; i < pack.prim_index.size(); ++i) { + if ((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + tri_prim_index[pack.prim_index[i]] = pack.prim_tri_index[i]; + } + } + } + /* Fill in all the arrays. 
*/ if(tri_size != 0) { /* normals */ progress.set_status("Updating Mesh", "Computing normals"); uint *tri_shader = dscene->tri_shader.resize(tri_size); float4 *vnormal = dscene->tri_vnormal.resize(vert_size); - float4 *tri_verts = dscene->tri_verts.resize(vert_size); - float4 *tri_vindex = dscene->tri_vindex.resize(tri_size); + uint4 *tri_vindex = dscene->tri_vindex.resize(tri_size); foreach(Mesh *mesh, scene->meshes) { - mesh->pack_normals(scene, &tri_shader[mesh->tri_offset], &vnormal[mesh->vert_offset]); - mesh->pack_verts(&tri_verts[mesh->vert_offset], &tri_vindex[mesh->tri_offset], mesh->vert_offset); - + mesh->pack_normals(scene, + &tri_shader[mesh->tri_offset], + &vnormal[mesh->vert_offset]); + mesh->pack_verts(tri_prim_index, + &tri_vindex[mesh->tri_offset], + mesh->vert_offset, + mesh->tri_offset); if(progress.get_cancel()) return; } @@ -1114,10 +1179,8 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene device->tex_alloc("__tri_shader", dscene->tri_shader); device->tex_alloc("__tri_vnormal", dscene->tri_vnormal); - device->tex_alloc("__tri_verts", dscene->tri_verts); device->tex_alloc("__tri_vindex", dscene->tri_vindex); } - if(curve_size != 0) { progress.set_status("Updating Mesh", "Copying Strands to device"); @@ -1132,6 +1195,19 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene device->tex_alloc("__curve_keys", dscene->curve_keys); device->tex_alloc("__curves", dscene->curves); } + if(for_displacement) { + float4 *prim_tri_verts = dscene->prim_tri_verts.resize(tri_size * 3); + foreach(Mesh *mesh, scene->meshes) { + for(size_t i = 0; i < mesh->num_triangles(); ++i) { + Mesh::Triangle t = mesh->get_triangle(i); + size_t offset = 3 * (i + mesh->tri_offset); + prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]); + prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]); + prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]); + } + } + 
device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); + } } void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) @@ -1146,6 +1222,7 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * bparams.top_level = true; bparams.use_qbvh = scene->params.use_qbvh; bparams.use_spatial_split = scene->params.use_bvh_spatial_split; + bparams.use_unaligned_nodes = dscene->data.bvh.have_curves; delete bvh; bvh = BVH::create(bparams, scene->objects); @@ -1170,9 +1247,13 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size()); device->tex_alloc("__object_node", dscene->object_node); } - if(pack.tri_storage.size()) { - dscene->tri_storage.reference(&pack.tri_storage[0], pack.tri_storage.size()); - device->tex_alloc("__tri_storage", dscene->tri_storage); + if(pack.prim_tri_index.size()) { + dscene->prim_tri_index.reference((uint*)&pack.prim_tri_index[0], pack.prim_tri_index.size()); + device->tex_alloc("__prim_tri_index", dscene->prim_tri_index); + } + if(pack.prim_tri_verts.size()) { + dscene->prim_tri_verts.reference((float4*)&pack.prim_tri_verts[0], pack.prim_tri_verts.size()); + device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); } if(pack.prim_type.size()) { dscene->prim_type.reference((uint*)&pack.prim_type[0], pack.prim_type.size()); @@ -1273,7 +1354,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen VLOG(1) << "Total " << scene->meshes.size() << " meshes."; - /* update normals */ + /* Update normals. */ foreach(Mesh *mesh, scene->meshes) { foreach(Shader *shader, mesh->used_shaders) { if(shader->need_update_attributes) @@ -1289,17 +1370,17 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen } /* Update images needed for true displacement. 
*/ - bool need_displacement_images = false; + bool true_displacement_used = false; bool old_need_object_flags_update = false; foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update && mesh->displacement_method != Mesh::DISPLACE_BUMP) { - need_displacement_images = true; + true_displacement_used = true; break; } } - if(need_displacement_images) { + if(true_displacement_used) { VLOG(1) << "Updating images used for true displacement."; device_update_displacement_images(device, dscene, scene, progress); old_need_object_flags_update = scene->object_manager->need_flags_update; @@ -1310,49 +1391,52 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen false); } - /* device update */ + /* Device update. */ device_free(device, dscene); - device_update_mesh(device, dscene, scene, progress); + mesh_calc_offset(scene); + if(true_displacement_used) { + device_update_mesh(device, dscene, scene, true, progress); + } if(progress.get_cancel()) return; device_update_attributes(device, dscene, scene, progress); if(progress.get_cancel()) return; - /* update displacement */ + /* Update displacement. */ bool displacement_done = false; - - foreach(Mesh *mesh, scene->meshes) - if(mesh->need_update && displace(device, dscene, scene, mesh, progress)) + foreach(Mesh *mesh, scene->meshes) { + if(mesh->need_update && + displace(device, dscene, scene, mesh, progress)) + { displacement_done = true; + } + } - /* todo: properly handle cancel halfway displacement */ + /* TODO: properly handle cancel halfway displacement */ if(progress.get_cancel()) return; - /* device re-update after displacement */ + /* Device re-update after displacement. */ if(displacement_done) { device_free(device, dscene); - device_update_mesh(device, dscene, scene, progress); - if(progress.get_cancel()) return; - device_update_attributes(device, dscene, scene, progress); if(progress.get_cancel()) return; } - /* update bvh */ + /* Update bvh. 
*/ size_t i = 0, num_bvh = 0; - - foreach(Mesh *mesh, scene->meshes) - if(mesh->need_update && mesh->need_build_bvh()) + foreach(Mesh *mesh, scene->meshes) { + if(mesh->need_update && mesh->need_build_bvh()) { num_bvh++; - + } + } TaskPool pool; - foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update) { pool.push(function_bind(&Mesh::compute_bvh, mesh, + dscene, &scene->params, &progress, i, @@ -1362,14 +1446,14 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen } } } - TaskPool::Summary summary; pool.wait_work(&summary); VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report(); - foreach(Shader *shader, scene->shaders) + foreach(Shader *shader, scene->shaders) { shader->need_update_attributes = false; + } #ifdef __OBJECT_MOTION__ Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading); @@ -1378,18 +1462,23 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen bool motion_blur = false; #endif - /* update obejcts */ + /* Update objects. */ vector<Object *> volume_objects; - foreach(Object *object, scene->objects) + foreach(Object *object, scene->objects) { object->compute_bounds(motion_blur); + } if(progress.get_cancel()) return; device_update_bvh(device, dscene, scene, progress); + if(progress.get_cancel()) return; + + device_update_mesh(device, dscene, scene, false, progress); + if(progress.get_cancel()) return; need_update = false; - if(need_displacement_images) { + if(true_displacement_used) { /* Re-tag flags for update, so they're re-evaluated * for meshes with correct bounding boxes. 
* @@ -1405,7 +1494,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->bvh_nodes); device->tex_free(dscene->bvh_leaf_nodes); device->tex_free(dscene->object_node); - device->tex_free(dscene->tri_storage); + device->tex_free(dscene->prim_tri_verts); + device->tex_free(dscene->prim_tri_index); device->tex_free(dscene->prim_type); device->tex_free(dscene->prim_visibility); device->tex_free(dscene->prim_index); @@ -1413,7 +1503,6 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->tri_shader); device->tex_free(dscene->tri_vnormal); device->tex_free(dscene->tri_vindex); - device->tex_free(dscene->tri_verts); device->tex_free(dscene->curves); device->tex_free(dscene->curve_keys); device->tex_free(dscene->attributes_map); @@ -1423,7 +1512,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->bvh_nodes.clear(); dscene->object_node.clear(); - dscene->tri_storage.clear(); + dscene->prim_tri_verts.clear(); + dscene->prim_tri_index.clear(); dscene->prim_type.clear(); dscene->prim_visibility.clear(); dscene->prim_index.clear(); @@ -1431,7 +1521,6 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->tri_shader.clear(); dscene->tri_vnormal.clear(); dscene->tri_vindex.clear(); - dscene->tri_verts.clear(); dscene->curves.clear(); dscene->curve_keys.clear(); dscene->attributes_map.clear(); diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index edad6d32f00..0aea55544f2 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -72,7 +72,15 @@ public: int num_segments() { return num_keys - 1; } - void bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox& bounds) const; + void bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + BoundBox& bounds) const; + void bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + 
const Transform& aligned_space, + BoundBox& bounds) const; }; Curve get_curve(size_t i) const @@ -167,9 +175,16 @@ public: void add_vertex_normals(); void pack_normals(Scene *scene, uint *shader, float4 *vnormal); - void pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset); + void pack_verts(const vector<uint>& tri_prim_index, + uint4 *tri_vindex, + size_t vert_offset, + size_t tri_offset); void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset); - void compute_bvh(SceneParams *params, Progress *progress, int n, int total); + void compute_bvh(DeviceScene *dscene, + SceneParams *params, + Progress *progress, + int n, + int total); bool need_attribute(Scene *scene, AttributeStandard std); bool need_attribute(Scene *scene, ustring name); @@ -213,15 +228,41 @@ public: void update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes); void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_attributes(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_displacement_images(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); + void device_free(Device *device, DeviceScene *dscene); void tag_update(Scene *scene); + +protected: + /* Calculate verts/triangles/curves offsets in global arrays. 
*/ + void mesh_calc_offset(Scene *scene); + + void device_update_object(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_mesh(Device *device, + DeviceScene *dscene, + Scene *scene, + bool for_displacement, + Progress& progress); + + void device_update_attributes(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_bvh(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_displacement_images(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 87020823df9..15b55d17301 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -611,10 +611,10 @@ static float sky_perez_function(float lam[6], float theta, float gamma) static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity) { /* - * We re-use the SunSky struct of the new model, to avoid extra variables - * zenith_Y/x/y is now radiance_x/y/z - * perez_Y/x/y is now config_x/y/z - */ + * We re-use the SunSky struct of the new model, to avoid extra variables + * zenith_Y/x/y is now radiance_x/y/z + * perez_Y/x/y is now config_x/y/z + */ float2 spherical = sky_spherical_coordinates(dir); float theta = spherical.x; @@ -1596,7 +1596,7 @@ void RGBToBWNode::compile(SVMCompiler& compiler) void RGBToBWNode::compile(OSLCompiler& compiler) { - compiler.add(this, "node_convert_from_color"); + compiler.add(this, "node_rgb_to_bw"); } /* Convert */ diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index b821e2b6475..925e84ad96d 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -63,7 +63,8 @@ public: device_vector<float4> bvh_nodes; device_vector<float4> bvh_leaf_nodes; device_vector<uint> object_node; - device_vector<float4> tri_storage; + device_vector<uint> 
prim_tri_index; + device_vector<float4> prim_tri_verts; device_vector<uint> prim_type; device_vector<uint> prim_visibility; device_vector<uint> prim_index; @@ -72,8 +73,7 @@ public: /* mesh */ device_vector<uint> tri_shader; device_vector<float4> tri_vnormal; - device_vector<float4> tri_vindex; - device_vector<float4> tri_verts; + device_vector<uint4> tri_vindex; device_vector<float4> curves; device_vector<float4> curve_keys; diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index cef5adc0a61..599222da9c5 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -151,7 +151,7 @@ public: (isfinite(max.x) && isfinite(max.y) && isfinite(max.z)); } - BoundBox transformed(const Transform *tfm) + BoundBox transformed(const Transform *tfm) const { BoundBox result = BoundBox::empty; diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index f01db64a79b..6fed18a3db8 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -127,6 +127,19 @@ ccl_device_inline Transform make_transform(float a, float b, float c, float d, return t; } +/* Constructs a coordinate frame from a normalized normal. */ +ccl_device_inline Transform make_transform_frame(float3 N) +{ + const float3 dx0 = cross(make_float3(1.0f, 0.0f, 0.0f), N); + const float3 dx1 = cross(make_float3(0.0f, 1.0f, 0.0f), N); + const float3 dx = normalize((dot(dx0,dx0) > dot(dx1,dx1))? 
dx0: dx1); + const float3 dy = normalize(cross(N, dx)); + return make_transform(dx.x, dx.y, dx.z, 0.0f, + dy.x, dy.y, dy.z, 0.0f, + N.x , N.y, N.z, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f); +} + #ifndef __KERNEL_GPU__ ccl_device_inline Transform operator*(const Transform a, const Transform b) diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 972befa185b..257c6ad7491 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,7 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ #if defined(_WIN32) && !defined(FREE_WINDOWS) |